def tokens2pos(sep, tokens):
    """Tag tokenized words with their parts of speech.

    Emits one line per token via ``output``: the original word and its
    POS tag, comma-separated. Punctuation is treated as a separate token.

    Args:
        sep: separator character. NOTE(review): currently unused — the
            comma is hard-coded below; the duplicate definition later in
            this file honors ``sep``. Confirm which variant is intended.
        tokens: file-like object (or iterable) of input tokens, consumed
            by ``read_tokens``.
    """
    content = read_tokens(tokens)
    # Point nltk at the bundled data directory so the tagger model loads.
    nltk.data.path.append(data_item())
    tags = nltk.pos_tag(content)
    # Plain loop instead of the original list comprehension, which built
    # a throwaway list purely for its side effects.
    for word, tag in tags:
        output("{},{}".format(word, tag))
def tokens2pos(sep, tokens):
    """Tag tokenized words with their parts of speech.

    Writes one row per token: the word followed by its part-of-speech
    tag, separated by the character specified by --sep.

    Args:
        sep: separator placed between each word and its tag.
        tokens: file-like object (or iterable) of input tokens, consumed
            by ``read_tokens``.
    """
    words = read_tokens(tokens)
    # Make the bundled tagger data discoverable by nltk.
    nltk.data.path.append(data_item())
    tagged = nltk.pos_tag(words)
    write_csv(tagged, str(sep))
def get_stopwords(stopword_name):
    """Load a named stop-word list from the package data directory.

    Args:
        stopword_name: basename (without the ``.txt`` extension) of a
            file under the data directory's ``stopwords/`` folder.

    Returns:
        The stop-word tokens as produced by ``read_tokens``.
    """
    path = data_item('/stopwords/' + stopword_name + '.txt')
    # Original bound the open file handle to a variable named ``filename``
    # and carried a dead ``stopwords = []`` initialisation; both fixed.
    with open(path) as stopword_file:
        return read_tokens(stopword_file)