def filterwords(language, custom, tokens):
    '''Remove stop words from tokens, returning tokens without stop words.'''
    content = read_tokens(tokens)
    stopwords = get_stopwords(language)
    if custom:
        stopwords = stopwords + read_tokens(custom)
    [output(token) for token in content if token.lower() not in stopwords]

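# Hedged sketch (not the project's actual implementation): nearly every
# command in this section assumes a `read_tokens` helper that reads
# newline-delimited tokens from a file-like object and an `output` helper
# that writes one result line to stdout. A minimal version might be:
import sys

def read_tokens(file_obj):
    # One token per line; drop blank lines and surrounding whitespace.
    return [line.strip() for line in file_obj if line.strip()]

def output(line):
    # Write a single result line to stdout.
    sys.stdout.write(line + '\n')
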
def words2ngrams(sep, length, tokens):
    '''Tokenize words into ngrams. ngrams are n-length word tokens.
    Punctuation is considered as a separate token.'''
    content = read_tokens(tokens)
    ngrams = list(nltk.ngrams(content, length))
    [output(sep.join(ngram)) for ngram in ngrams]

def test_read_tokens():
    with open('test_data/word_tokens.txt', 'r') as f:
        tokens = read_tokens(f)
    assert len(tokens) == 6

def words2ngrams(sep, num, tokens):
    '''Convert word tokens into ngrams. ngrams are n-length word tokens.
    Punctuation is considered as a separate token.'''
    content = read_tokens(tokens)
    ngrams = list(nltk.ngrams(content, num))
    write_csv(ngrams, str(sep))

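# Hedged sketch: `write_csv` is assumed to serialize rows of tokens to stdout
# with the given separator as the delimiter. A minimal version using the
# standard csv module could look like this:
import csv
import sys

def write_csv(rows, delimiter):
    writer = csv.writer(sys.stdout, delimiter=delimiter)
    for row in rows:
        writer.writerow(row)
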
def words2bigrams(sep, tokens):
    '''Tokenize words into bigrams. Bigrams are two word tokens.
    Punctuation is considered as a separate token.'''
    content = read_tokens(tokens)
    bigrams = list(nltk.bigrams(content))
    [output(sep.join(bigram)) for bigram in bigrams]

def tokens2pos(sep, tokens):
    '''Tokenize words into their parts of speech. Each item is the original
    word with its role as the second part of the item. Punctuation is
    considered as a separate token.'''
    content = read_tokens(tokens)
    nltk.data.path.append(data_item())
    tags = nltk.pos_tag(content)
    [output("{},{}".format(t[0], t[1])) for t in tags]

def tokens2pos(sep, tokens):
    '''Tokenize words into their parts of speech. Output contains the word
    token followed by its part-of-speech tag, separated by the character
    specified by --sep.'''
    content = read_tokens(tokens)
    nltk.data.path.append(data_item())
    tags = nltk.pos_tag(content)
    write_csv(tags, str(sep))

def tokens2stem(tokens, algorithm):
    '''Stem a list of tokens to get their root.'''
    content = read_tokens(tokens)
    stemmer = ALGOS[algorithm]()
    if algorithm == 'wordnet':
        for token in content:
            output(stemmer.lemmatize(token))
    else:
        for token in content:
            output(stemmer.stem(token))

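# Hedged sketch: `ALGOS` is assumed to map algorithm names to NLTK stemmer or
# lemmatizer classes that can be instantiated without arguments. The exact set
# of keys is an assumption; a plausible mapping is:
import nltk

ALGOS = {
    'porter': nltk.stem.PorterStemmer,
    'lancaster': nltk.stem.LancasterStemmer,
    'wordnet': nltk.stem.WordNetLemmatizer,
}
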
def words2bigrams(sep, tokens):
    '''Tokenize words into bigrams. Bigrams are two word tokens.
    Punctuation is considered as a separate token.'''
    content = read_tokens(tokens)
    bigrams = []
    try:
        bigrams = list(nltk.bigrams(content))
    except LookupError as err:
        click.echo(message="Error with tokenization", nl=True)
        click.echo(message='Have you run "textkit download"?', nl=True)
        click.echo(message="\nOriginal Error:", nl=True)
        click.echo(err)
    [output(sep.join(bigram)) for bigram in bigrams]

def tokens2counts(sep, limit, tokens):
    '''Count unique tokens in a list of tokens. Tokens are sorted by top counts.'''
    content = read_tokens(tokens)
    counts = sort_counts(get_counts(content))

    # We want the argument type to be an INT, but Python only supports a
    # float infinity. So if the limit is negative, treat it as infinite.
    if limit < 0:
        limit = float('inf')

    # Use the csv writer to ensure proper encoding of the separator.
    rows = [list(map(str, vals))
            for ind, vals in enumerate(counts) if ind < limit]
    write_csv(rows, str(sep))

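# Hedged sketch: `get_counts` and `sort_counts` are assumed to tally tokens
# and order the (token, count) pairs by descending frequency. The standard
# library's collections.Counter covers both:
from collections import Counter

def get_counts(tokens):
    return Counter(tokens)

def sort_counts(counts):
    # (token, count) pairs, most frequent first.
    return counts.most_common()
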
def tokens2topbigrams(sep, measure, freq, scores, tokens):
    '''Find the most interesting bi-grams in a token document. Uses the
    --measure argument to determine which measure defines 'interesting'.'''
    content = read_tokens(tokens)
    bcf = nltk.collocations.BigramCollocationFinder.from_words(content)
    bcf.apply_freq_filter(freq)
    nltk_measure = MEASURES[measure]
    bigrams = bcf.score_ngrams(nltk_measure)

    out = [b[0] for b in bigrams]
    if scores:
        out = [b[0] + tuple([str(b[1])]) for b in bigrams]
    write_csv(out, str(sep))

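# Hedged sketch: `MEASURES` is assumed to map measure names to association
# measures on nltk.collocations.BigramAssocMeasures. The key names are an
# assumption; the NLTK measures themselves are real:
import nltk

MEASURES = {
    'likelihood': nltk.collocations.BigramAssocMeasures.likelihood_ratio,
    'chi_sq': nltk.collocations.BigramAssocMeasures.chi_sq,
    'pmi': nltk.collocations.BigramAssocMeasures.pmi,
    'student_t': nltk.collocations.BigramAssocMeasures.student_t,
    'freq': nltk.collocations.BigramAssocMeasures.raw_freq,
}
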
def tokens2json(ids, names, field, split, sep, token_docs):
    '''Convert a set of token documents into a JSON array of document objects.'''
    docs = []
    names = read_names(names)
    ids = read_names(ids)

    for idx, path in enumerate(token_docs):
        if path == '-':
            tokens_doc = sys.stdin
        else:
            tokens_doc = open(path, 'r')

        if split:
            content = read_csv(tokens_doc, sep)
            content = coerce_types(content)
        else:
            content = read_tokens(tokens_doc)

        # Ordered so that these attributes stay at the top.
        doc = OrderedDict()
        # Use the provided id/name when one exists for this index;
        # otherwise fall back to the document's path.
        if idx < len(ids):
            doc['id'] = ids[idx]
        else:
            doc['id'] = path
        if idx < len(names):
            doc['name'] = names[idx]
        else:
            doc['name'] = path

        doc[field] = content
        docs.append(doc)
        tokens_doc.close()

    out_content = json.dumps(docs, indent=2)
    output(out_content)

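# Hedged sketch: `read_csv` and `coerce_types` are assumed to parse delimited
# rows from a file-like object and convert numeric-looking strings to numbers.
# Both names and behaviors here are assumptions for illustration:
import csv

def read_csv(file_obj, delimiter):
    return [row for row in csv.reader(file_obj, delimiter=delimiter)]

def coerce_types(rows):
    def coerce(value):
        for cast in (int, float):
            try:
                return cast(value)
            except ValueError:
                pass
        return value
    return [[coerce(value) for value in row] for row in rows]
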
def uppercase(tokens):
    '''Transform all tokens to uppercase.'''
    content = read_tokens(tokens)
    [output(token.upper()) for token in content]

def filterlengths(minimum, tokens):
    '''Remove tokens that are shorter than the minimum length provided.'''
    content = read_tokens(tokens)
    [output(token) for token in content if len(token) >= minimum]

def get_stopwords(stopword_name):
    path = data_item('/stopwords/' + stopword_name + '.txt')
    stopwords = []
    with open(path) as filename:
        stopwords = read_tokens(filename)
    return stopwords

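# Hedged sketch: `data_item` is assumed to resolve a path inside the package's
# bundled data directory (used above both for stopword lists and as an extra
# NLTK data path). The directory layout is an assumption:
import os

def data_item(path=''):
    data_dir = os.path.join(os.path.dirname(__file__), 'data')
    return data_dir + path
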
def read_names(names_path):
    names = []
    if names_path:
        with open(names_path, 'r') as names_doc:
            names = read_tokens(names_doc)
    return names

def filterpunc(tokens):
    '''Remove tokens that are only punctuation from a list of tokens.'''
    content = read_tokens(tokens)
    [output(token) for token in content if token not in punctuation]

def tokens2lower(tokens):
    '''Transform all tokens to lowercase.'''
    content = read_tokens(tokens)
    [output(token.lower()) for token in content]

def tokens2text(sep, tokens):
    '''Combine tokens in a token document into a single text file.'''
    content = read_tokens(tokens)
    out = sep.join(content)
    output(out)