def texts2json(ids, names, field, text_docs):
    """Convert a set of text documents into a JSON array of document objects."""
    docs = []
    names = read_names(names)
    ids = read_names(ids)

    for idx, path in enumerate(text_docs):
        with click.open_file(path) as tokens_doc:
            content = tokens_doc.read()

        # ordered so that these attributes stay at the top
        doc = OrderedDict()

        # fall back to the file path when no explicit id or name was supplied
        if idx < len(ids):
            doc["id"] = ids[idx]
        else:
            doc["id"] = path

        if idx < len(names):
            doc["name"] = names[idx]
        else:
            doc["name"] = path

        doc[field] = content
        docs.append(doc)

    out_content = json.dumps(docs, indent=2)
    output(out_content)

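# Hedged sketch of the JSON shape produced above: each document becomes an
# ordered object with "id", "name", and the chosen field. The id, name, and
# text below are made up for illustration.
import json
from collections import OrderedDict

doc = OrderedDict()
doc["id"] = "doc-1"                 # hypothetical id
doc["name"] = "example.txt"         # hypothetical name
doc["text"] = "full text of the document..."
print(json.dumps([doc], indent=2))
# [
#   {
#     "id": "doc-1",
#     "name": "example.txt",
#     "text": "full text of the document..."
#   }
# ]
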
def tokens2stem(tokens, algorithm):
    '''Stem a list of tokens to get their root.'''
    content = read_tokens(tokens)
    stemmer = ALGOS[algorithm]()

    if algorithm == 'wordnet':
        for token in content:
            output(stemmer.lemmatize(token))
    else:
        for token in content:
            output(stemmer.stem(token))

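# A minimal sketch of what the ALGOS lookup might contain, assuming it maps
# algorithm names to NLTK stemmer/lemmatizer classes (textkit's actual mapping
# may differ). It also shows why the 'wordnet' branch calls lemmatize().
from nltk.stem import LancasterStemmer, PorterStemmer, WordNetLemmatizer

ALGOS_EXAMPLE = {
    'porter': PorterStemmer,
    'lancaster': LancasterStemmer,
    'wordnet': WordNetLemmatizer,
}

print(PorterStemmer().stem('running'))          # 'run'
print(WordNetLemmatizer().lemmatize('geese'))   # 'goose' (needs the wordnet corpus)
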
def words2ngrams(sep, length, tokens):
    '''Tokenize words into ngrams. ngrams are n-length word tokens.
    Punctuation is considered as a separate token.'''
    content = read_tokens(tokens)
    ngrams = list(nltk.ngrams(content, length))
    [output(sep.join(ngram)) for ngram in ngrams]

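# Quick standalone illustration of nltk.ngrams, which does the real work above:
# it yields tuples of `length` consecutive tokens.
import nltk

print(list(nltk.ngrams(['the', 'quick', 'brown', 'fox'], 3)))
# [('the', 'quick', 'brown'), ('quick', 'brown', 'fox')]
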
def words2bigrams(sep, tokens):
    '''Tokenize words into bigrams. Bigrams are two word tokens.
    Punctuation is considered as a separate token.'''
    content = read_tokens(tokens)
    bigrams = list(nltk.bigrams(content))
    [output(sep.join(bigram)) for bigram in bigrams]

def filterwords(language, custom, tokens):
    '''Remove stop words from tokens, returning tokens without stop words.'''
    content = read_tokens(tokens)
    stopwords = get_stopwords(language)

    if custom:
        stopwords = stopwords + read_tokens(custom)

    [output(token) for token in content if token.lower() not in stopwords]

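# Rough sketch of what get_stopwords() might wrap, assuming it is backed by
# NLTK's bundled stop word lists (textkit's own helper may differ).
from nltk.corpus import stopwords as nltk_stopwords

def get_stopwords_example(language='english'):
    # Requires the 'stopwords' corpus to be downloaded first.
    return nltk_stopwords.words(language)

print(get_stopwords_example()[:5])
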
def tokens2pos(sep, tokens):
    '''Tokenize words into their parts of speech. Each item is the original
    word with its role as the second part of the item.
    Punctuation is considered as a separate token.'''
    content = read_tokens(tokens)
    nltk.data.path.append(data_item())
    tags = nltk.pos_tag(content)
    [output("{},{}".format(t[0], t[1])) for t in tags]

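# Standalone illustration of nltk.pos_tag, which tokens2pos prints as
# comma-separated "word,TAG" pairs (requires the tagger model to be downloaded).
import nltk

print(nltk.pos_tag(['The', 'cat', 'sat']))
# e.g. [('The', 'DT'), ('cat', 'NN'), ('sat', 'VBD')]
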
def top_bigrams(sep, measure, freq, scores, tokens):
    '''Find top most interesting bi-grams in a token document.
    Uses the --measure argument to determine what measure to use to
    define 'interesting'.
    '''
    content = read_tokens(tokens)
    bcf = nltk.collocations.BigramCollocationFinder.from_words(content)
    bcf.apply_freq_filter(freq)
    nltk_measure = MEASURES[measure]
    bigrams = bcf.score_ngrams(nltk_measure)

    out = [b[0] for b in bigrams]
    if scores:
        out = [b[0] + tuple([str(b[1])]) for b in bigrams]

    [output(sep.join(line)) for line in out]

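# Hedged sketch of what the MEASURES lookup might map to, assuming it points
# at NLTK's bigram association measures (the actual textkit mapping may differ).
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

MEASURES_EXAMPLE = {
    'pmi': BigramAssocMeasures.pmi,
    'chi_sq': BigramAssocMeasures.chi_sq,
    'likelihood': BigramAssocMeasures.likelihood_ratio,
}

words = 'the quick brown fox jumps over the lazy dog the quick brown fox'.split()
finder = BigramCollocationFinder.from_words(words)
finder.apply_freq_filter(2)  # keep only bigrams seen at least twice
print(finder.score_ngrams(MEASURES_EXAMPLE['pmi']))
# list of ((word1, word2), score) pairs, highest-scoring first
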
def transliterate(file):
    '''Convert international text to ascii.'''
    content = ''.join(file.readlines())
    try:
        content = content.decode(chardet.detect(content)['encoding'])
    except AttributeError:
        # Strings do not have a decode method in python 3.
        pass
    output(unidecode(content).encode('ascii', 'ignore'))

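# Standalone illustration of unidecode, which performs the actual ASCII folding.
from unidecode import unidecode

print(unidecode('café déjà vu'))   # 'cafe deja vu'
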
def text2punc(text):
    '''Tokenize text into punctuation tokens. Words and numbers are removed,
    leaving only punctuation.'''
    # from: http://stackoverflow.com/questions/17485092/how-to-just-keep-punctuation-with-a-string-in-python
    content = '\n'.join([open(f).read() for f in text])
    out = re.sub(r'[^{}]+'.format(punctuation), ' ', content)
    out = out.split()
    [output(p) for p in out]

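# Small demo of the regex trick above: every run of non-punctuation characters
# is collapsed to a space, so only punctuation marks survive the split.
import re
from string import punctuation

sample = "Hello, world! Isn't this fun?"
print(re.sub(r'[^{}]+'.format(punctuation), ' ', sample).split())
# [',', '!', "'", '?']
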
def text2sentences(text):
    '''Tokenize text into sentence tokens.'''
    content = '\n'.join([open(f).read() for f in text])
    sentences = []
    try:
        sentences = sent_tokenize(content)
    except LookupError as err:
        click.echo(message="Error with tokenization", nl=True)
        click.echo(message="Have you run \"textkit download\"?", nl=True)
        click.echo(message="\nOriginal Error:", nl=True)
        click.echo(err)
    [output(s.strip()) for s in sentences]

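# Standalone illustration of sent_tokenize (needs the 'punkt' tokenizer data,
# which `textkit download` is expected to fetch).
from nltk.tokenize import sent_tokenize

print(sent_tokenize('This is one sentence. Here is another!'))
# ['This is one sentence.', 'Here is another!']
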
def tokens2json(ids, names, field, split, sep, token_docs):
    '''Convert a set of token documents into a JSON array of document objects.'''
    docs = []
    names = read_names(names)
    ids = read_names(ids)

    for idx, path in enumerate(token_docs):
        if path == '-':
            tokens_doc = sys.stdin
        else:
            tokens_doc = open(path, 'r')

        if split:
            content = read_csv(tokens_doc, sep)
            content = coerce_types(content)
        else:
            content = read_tokens(tokens_doc)

        # ordered so that these attributes stay at the top
        doc = OrderedDict()

        # fall back to the file path when no explicit id or name was supplied
        if idx < len(ids):
            doc['id'] = ids[idx]
        else:
            doc['id'] = path

        if idx < len(names):
            doc['name'] = names[idx]
        else:
            doc['name'] = path

        doc[field] = content
        docs.append(doc)
        tokens_doc.close()

    out_content = json.dumps(docs, indent=2)
    output(out_content)

def text2words(text):
    '''Tokenize text into word tokens. Punctuation is considered as a
    separate token.'''
    content = '\n'.join([open(f).read() for f in text])
    tokens = []
    try:
        tokens = nltk.word_tokenize(content)
    except LookupError as err:
        click.echo(message="Error with tokenization", nl=True)
        click.echo(message="Have you run \"textkit download\"?", nl=True)
        click.echo(message="\nOriginal Error:", nl=True)
        click.echo(err)
    [output(token) for token in tokens]

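# Standalone illustration of nltk.word_tokenize; punctuation comes back as
# separate tokens, which is why filters such as filterpunc exist downstream.
import nltk

print(nltk.word_tokenize("Don't panic, okay?"))
# ['Do', "n't", 'panic', ',', 'okay', '?']
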
def words2bigrams(sep, tokens):
    """Tokenize words into bigrams. Bigrams are two word tokens.
    Punctuation is considered as a separate token."""
    content = read_tokens(tokens)
    bigrams = []
    try:
        bigrams = list(nltk.bigrams(content))
    except LookupError as err:
        click.echo(message="Error with tokenization", nl=True)
        click.echo(message='Have you run "textkit download"?', nl=True)
        click.echo(message="\nOriginal Error:", nl=True)
        click.echo(err)
    [output(sep.join(bigram)) for bigram in bigrams]

def filterpunc(tokens):
    '''Remove tokens that are only punctuation from a list of tokens.'''
    content = read_tokens(tokens)
    [output(token) for token in content if token not in punctuation]

def nonewlines(text):
    '''Remove newlines from a text file.'''
    content = '\n'.join([open(f).read() for f in text])
    content = re.sub('\n|\r\n|\r', ' ', content).strip()
    output(content)

def text2words(text):
    '''Tokenize text into word tokens. Punctuation is considered as a
    separate token.'''
    content = '\n'.join([open(f).read() for f in text])
    tokens = nltk.word_tokenize(content)
    [output(token) for token in tokens]

def uppercase(tokens):
    '''Transform all tokens to uppercase.'''
    content = read_tokens(tokens)
    [output(token.upper()) for token in content]

def filterlengths(minimum, tokens):
    '''Remove tokens that are shorter than the minimum length provided.'''
    content = read_tokens(tokens)
    [output(token) for token in content if len(token) >= minimum]

def tokens2text(sep, tokens):
    '''Combine tokens in a token document into a single text file.'''
    content = read_tokens(tokens)
    out = sep.join(content)
    output(out)

def nonewlines(text):
    """Remove newlines from a text file."""
    content = "\n".join([open(f).read() for f in text])
    content = content.replace("\n", " ").strip()
    output(content)

def showstops(language):
    '''Display stop words used by textkit for a given language.'''
    stopwords = get_stopwords(language)
    [output(token) for token in stopwords]

def text2sentences(text):
    '''Tokenize text into sentence tokens.'''
    content = '\n'.join([open(f).read() for f in text])
    sentences = sent_tokenize(content)
    [output(s.strip()) for s in sentences]

def tokens2lower(tokens):
    '''Transform all tokens to lowercase.'''
    content = read_tokens(tokens)
    [output(token.lower()) for token in content]
