def words2ngrams(sep, num, tokens):
    '''Convert word tokens into ngrams. ngrams are n-length word tokens.
    Punctuation is considered as a separate token.'''
    words = read_tokens(tokens)
    # Materialize the ngrams into a list before handing them to the writer;
    # each ngram becomes one output row, joined with the requested separator.
    write_csv(list(nltk.ngrams(words, num)), str(sep))
def tokens2pos(sep, tokens):
    '''Tokenize words into their parts of speech. Output contains the
    word token followed by its part-of-speech tag, separated by the
    character specified by --sep.
    '''
    words = read_tokens(tokens)
    # Extend NLTK's resource search path with the location returned by
    # data_item() before tagging (presumably where the tagger data was
    # downloaded -- confirm against data_item's definition).
    nltk.data.path.append(data_item())
    # pos_tag output is written as-is, one tagged token per row.
    write_csv(nltk.pos_tag(words), str(sep))
def text2ngrams(sep, num, text):
    '''Tokenize plain text into ngrams. ngrams are n-length word tokens.
    Punctuation is considered as a separate token.'''
    # Read every input file inside a context manager so each handle is
    # closed as soon as its contents are read, instead of being left open
    # until garbage collection.
    pieces = []
    for path in text:
        with open(path) as handle:
            pieces.append(handle.read())
    # Files are joined with a newline so tokens never merge across files.
    content = '\n'.join(pieces)

    # Only the tokenizer call is guarded: LookupError is what NLTK raises
    # when its tokenizer data is missing, but it is also the base class of
    # KeyError/IndexError, so guarding the CSV writer too would mislabel
    # unrelated failures as a tokenization problem.
    try:
        tokens = nltk.word_tokenize(content)
    except LookupError as err:
        click.echo(message="Error with tokenization", nl=True)
        click.echo(message="Have you run \"textkit download\"?", nl=True)
        click.echo(message="\nOriginal Error:", nl=True)
        click.echo(err)
        return

    ngrams = list(nltk.ngrams(tokens, num))
    write_csv(ngrams, str(sep))
def tokens2counts(sep, limit, tokens):
    '''Count unique tokens in a list of tokens.
    Tokens are sorted by top counts.'''
    words = read_tokens(tokens)
    ranked = sort_counts(get_counts(words))

    # The limit argument is meant to be an int, but Python only offers a
    # float infinity; a negative limit therefore means "no limit".
    cap = float('inf') if limit < 0 else limit

    # Every field is stringified up front; the csv writer then takes care
    # of properly encoding the separator.
    rows = []
    for position, entry in enumerate(ranked):
        if position < cap:
            rows.append([str(field) for field in entry])
    write_csv(rows, str(sep))
def tokens2topbigrams(sep, measure, freq, scores, tokens):
    '''Find top most interesting bi-grams in a token document.
    Uses the --measure argument to determine what measure to use to define
    'interesting'.
    '''
    words = read_tokens(tokens)
    finder = nltk.collocations.BigramCollocationFinder.from_words(words)
    # Apply the frequency filter with the caller-supplied threshold before
    # scoring the remaining bigrams.
    finder.apply_freq_filter(freq)
    # MEASURES maps the measure name onto the scoring function to use.
    scored = finder.score_ngrams(MEASURES[measure])

    if scores:
        # Append the stringified score as an extra column after the words.
        rows = [pair + (str(score),) for pair, score in scored]
    else:
        rows = [pair for pair, _ in scored]
    write_csv(rows, str(sep))