def pos_tag_stanford(text, output='str', input_file='temp.txt'):
    """
    Part-of-speech tagging using the Stanford tagger.

    Runs the tagger script with the ``french.tagger`` model.

    >>> pos_tag_stanford(u'ses beaux cheveux')
    u'ses_D beaux_JJ cheveux_NN '
    >>> pos_tag_stanford(u'ses beaux cheveux', output='list')
    [u'ses_D', u'beaux_JJ', u'cheveux_NN']
    >>> pos_tag_stanford(u'ses beaux cheveux', output='tuple')
    [(u'ses', u'D'), (u'beaux', u'JJ'), (u'cheveux', u'NN')]

    :param text: string or list of words
    :param output: the output format: 'str', 'list' or 'tuple'; any other
        value is treated like 'str'
    :param input_file: name of the file that stores the input for the tagger
    :return: string (default) with an underscore separating the word and
        the tag, list of such strings, or list of word-tag tuples
    """
    # The tagger script reads its input from a file, so a word list is
    # flattened to one space-separated string before being written out.
    sentence = ' '.join(text) if isinstance(text, list) else text
    helpers.write_to_file(sentence, input_file)

    command = [
        '../apparatus/stanford-postagger.sh',
        '../stanford/models/french.tagger',
        input_file,
    ]
    process = subprocess.Popen(command, shell=False, stdout=subprocess.PIPE)
    raw = process.communicate()[0]

    # Decode the tagger output and put everything on a single line.
    flattened = unicode(raw, 'utf-8').replace('\n', ' ')
    pairs = correct_tags_stanford(helpers.strings_to_tuples(flattened, '_'))

    if output == 'tuple':
        return pairs
    string_format = 'list' if output == 'list' else 'str'
    return helpers.tuples_to_strings(pairs, output=string_format)
def pos_tag(text, output='tuple', tagger='pos_tag_melt'):
    """
    Part-of-speech tagging. Use this to tag poems.

    >>> pos_tag(u'ses beaux cheveux')
    [(u'ses', u'DET'), (u'beaux', u'JJ'), (u'cheveux', u'NN')]
    >>> pos_tag(u'ses beaux cheveux', output='str')
    u'ses_DET beaux_JJ cheveux_NN '

    :param text: string of raw text
    :param output: 'str' to get a single string; any other value returns
        the list of tuples
    :param tagger: the method to use for tagging
    :return: list of analysed words as word-tag tuples (default), or string
        with an underscore separating the word and the tag
    """
    preprocessed = preprocess(text)
    tagged_list = tag_with_apertium(preprocessed, tagger)
    tuples = []
    for item in tagged_list:
        tupl_list = helpers.strings_to_tuples(item)
        if len(tupl_list) == 1:
            # A lone token is kept unless its word part is empty.
            if tupl_list[0][0] != '':
                tuples.append(tupl_list[0])
        elif len(tupl_list) > 1:
            # Several tokens in one item are merged into a single entry:
            # the words are joined with spaces and the tag of the last
            # token is used for the whole entry.
            words = ' '.join(tupl[0] for tupl in tupl_list)
            tuples.append((words, tupl_list[-1][1]))
    if output == 'str':
        return helpers.tuples_to_strings(tuples)
    return tuples
def analyse_morphologically(tagged_text, output='str'):
    """
    Morphological analysis of English using the morpha analyser.

    >>> analyse_morphologically([('O', 'UH'), ('Helen', 'NNP'),
    ... ('fair', 'JJ'), ('!', '.'), ('O', 'UH'), ('Helen', 'NNP'),
    ... ('chaste', 'JJ'), ('!', '.'), ('&', 'CC'), ('If', 'IN'),
    ... ('I', 'PRP'), ('were', 'VBDR'), ('with', 'IN'), ('thee', 'PRP'),
    ... (',', ','), ('I', 'PRP'), ('were', 'VBDR'), ('blest', 'VB'),
    ... ('.', '.')])  # doctest: +NORMALIZE_WHITESPACE
    'O_UH Helen_NNP fair_JJ !_. O_UH Helen_NNP chaste_JJ !_.
    &_CC If_IN I_PRP be+ed_VBDR with_IN thee_PRP ,_,
    I_PRP be+ed_VBDR bless+ed_VB ._. \\n'

    :param tagged_text: POS-tagged text, either a string with the tags
        separated by underscore or a list of (word, tag) tuples
    :param output: 'tuple' to get a list of tuples; any other value
        returns the analyser output unchanged
    :return: a string of the form lemma+affix_tag or a list of tuples of
        the form ('lemma+affix', 'tag')
    """
    if isinstance(tagged_text, list):
        tagged_text = helpers.tuples_to_strings(tagged_text)
    path = '../morph/morpha'
    echo = subprocess.Popen(['echo', tagged_text], shell=False,
                            stdout=subprocess.PIPE)
    try:
        morpha = subprocess.Popen([path, '-act'], shell=False,
                                  stdin=echo.stdout, stdout=subprocess.PIPE)
        # morpha has inherited the read end of the pipe; drop our copy so
        # that echo gets SIGPIPE if morpha exits early and the descriptor
        # is not leaked in this process.
        echo.stdout.close()
        result = morpha.communicate()[0]
    finally:
        # close() is idempotent; repeating it here covers the case where
        # morpha could not be started. Reaping echo must happen after
        # communicate() so that a full pipe can never block both sides.
        echo.stdout.close()
        echo.wait()
    if output == 'tuple':
        return helpers.strings_to_tuples(result)
    return result
def pos_tag_stanford(text, input_file='temp.txt'):
    """
    Part-of-speech tagging using the Stanford tagger.

    Runs the tagger script with the ``wsj-0-18-left3words-distsim.tagger``
    model.

    >>> pos_tag_stanford('O Helen fair! O Helen chaste!\\n'
    ...                  'If I were with thee, I were blest.'
    ...                  )  # doctest: +NORMALIZE_WHITESPACE
    [('O', 'UH'), ('Helen', 'NNP'), ('fair', 'JJ'), ('!', '.'),
     ('O', 'UH'), ('Helen', 'NNP'), ('chaste', 'JJ'), ('!', '.'),
     ('If', 'IN'), ('I', 'PRP'), ('were', 'VBDR'), ('with', 'IN'),
     ('thee', 'PRP'), (',', ','), ('I', 'PRP'), ('were', 'VBDR'),
     ('blest', 'VB'), ('.', '.')]

    :param text: string or list of words
    :param input_file: name of the file that stores the input for the tagger
    :return: list of word-tag tuples
    """
    # The tagger script reads its input from a file, so a word list is
    # flattened to one space-separated string before being written out.
    sentence = ' '.join(text) if isinstance(text, list) else text
    helpers.write_to_file(sentence, input_file)

    command = [
        '../apparatus/stanford-postagger.sh',
        '../stanford/models/wsj-0-18-left3words-distsim.tagger',
        input_file,
    ]
    process = subprocess.Popen(command, shell=False, stdout=subprocess.PIPE)
    raw = process.communicate()[0]

    # Put the whole tagger output on one line before splitting it into
    # (word, tag) pairs and applying the tag corrections.
    flattened = raw.replace('\n', ' ')
    return correct_tags(helpers.strings_to_tuples(flattened))
def pos_tag(text, output='tuple', tagger='pos_tag_melt'):
    """
    Part-of-speech tagging. Use this to tag poems.

    >>> pos_tag(u'ses beaux cheveux')
    [(u'ses', u'DET'), (u'beaux', u'JJ'), (u'cheveux', u'NN')]
    >>> pos_tag(u'ses beaux cheveux', output='str')
    u'ses_DET beaux_JJ cheveux_NN '

    :param text: string of raw text
    :param output: 'str' to get a single string; any other value returns
        the list of tuples
    :param tagger: the method to use for tagging
    :return: list of analysed words as word-tag tuples (default), or string
        with an underscore separating the word and the tag
    """
    preprocessed = preprocess(text)
    tagged_list = tag_with_apertium(preprocessed, tagger)
    tuples = []
    for item in tagged_list:
        tupl_list = helpers.strings_to_tuples(item)
        if len(tupl_list) == 1:
            # A lone token is kept unless its word part is empty.
            if tupl_list[0][0] != '':
                tuples.append(tupl_list[0])
        elif len(tupl_list) > 1:
            # Several tokens in one item are merged into a single entry:
            # the words are joined with spaces and the tag of the last
            # token is used for the whole entry.
            words = ' '.join(tupl[0] for tupl in tupl_list)
            tuples.append((words, tupl_list[-1][1]))
    if output == 'str':
        return helpers.tuples_to_strings(tuples)
    return tuples
def pos_tag_melt(text, output='str', input_file='temp.txt'):
    """
    Part-of-speech tagging using the MElt tagger.

    >>> pos_tag_melt(u'ses beaux cheveux')
    u'ses_DET beaux_JJ cheveux_NN '
    >>> pos_tag_melt(u'ses beaux cheveux', output='list')
    [u'ses_DET', u'beaux_JJ', u'cheveux_NN']
    >>> pos_tag_melt(u'ses beaux cheveux', output='tuple')
    [(u'ses', u'DET'), (u'beaux', u'JJ'), (u'cheveux', u'NN')]

    :param text: string or list of words
    :param output: the output format: 'str', 'list' or 'tuple'; any other
        value is treated like 'str'
    :param input_file: name of the file that stores the input for the tagger
    :return: string (default) with an underscore separating the word and
        the tag, list of such strings, or list of word-tag tuples
    """
    if isinstance(text, list):
        text = ' '.join(text)
    helpers.write_to_file(text, input_file)
    cat = subprocess.Popen(['cat', input_file], shell=False,
                           stdout=subprocess.PIPE)
    try:
        melt = subprocess.Popen(['MElt'], shell=False, stdin=cat.stdout,
                                stdout=subprocess.PIPE)
        # MElt has inherited the read end of the pipe; drop our copy so
        # that cat gets SIGPIPE if MElt exits early and the descriptor is
        # not leaked in this process.
        cat.stdout.close()
        tagged = melt.communicate()[0]
    finally:
        # close() is idempotent; repeating it here covers the case where
        # MElt could not be started. Reaping cat must happen after
        # communicate() so that a full pipe can never block both sides.
        cat.stdout.close()
        cat.wait()
    tagged = unicode(tagged, 'utf-8')
    # Tokens in the tagger output are split on '/'.
    tuples = helpers.strings_to_tuples(tagged, '/')
    tuples = correct_tags_melt(tuples)
    if output == 'list':
        return helpers.tuples_to_strings(tuples, output='list')
    if output == 'tuple':
        return tuples
    return helpers.tuples_to_strings(tuples, output='str')
def get_words_with_tag(language, words, pos_tag, tagger, separator='_'):
    """
    Returns the words that are tagged with the given tag.

    >>> get_words_with_tag('en', ['cat', 'little'], 'NN', 'pos_tag_stanford')
    [('cat', 'NN')]
    >>> get_words_with_tag('fr', [(u'jouer_VB'), (u'aimer_VB'),
    ... (u'peut-être_RB')], 'VB', 'pos_tag_melt')
    [(u'jouer', u'VB'), (u'aimer', u'VB')]

    :param language: language id, currently en, fr or fi; only consulted
        when the words are not tagged yet
    :param words: list of words, optionally tagged <word>_<tag>
    :param pos_tag: POS-tag to select
    :param tagger: method to use for tagging (not passed on for fi)
    :param separator: the character that separates the word and the tag in
        the word list
    :return: list of (word, tag) tuples of the given part of speech
    :raises ValueError: if the words are untagged and the language is not
        one of the supported ids
    """
    if not words:
        return []

    # Only the first word is inspected to decide whether the list already
    # carries tags.
    if separator in words[0]:
        tagged = helpers.strings_to_tuples(words, separator)
    elif language == 'en':
        tagged = tag_en.quick_pos_tag(words, tagger=tagger)
    elif language == 'fr':
        tagged = tag_fr.quick_pos_tag(words, tagger=tagger)
    elif language == 'fi':
        tagged = tag_fi.quick_pos_tag(words)
    else:
        # Previously this fell through with `tagged` unbound and failed
        # later with an unrelated-looking UnboundLocalError.
        raise ValueError('unsupported language: %r' % (language,))

    tagged = helpers.capitalize_nnp(tagged)
    # Tokens without a tag part (fewer than two fields) are skipped.
    return [token for token in tagged
            if len(token) > 1 and token[1] == pos_tag]