def clean_text(text):
    if not text:
        return ''

    # Collect (phrase, acronym) pairs found in parentheses so they can be
    # substituted consistently later on.
    abbreviations = identify_parenthetical_phrases()(text)

    # NLPre parsers are applied in order; pre_pos_blacklist and
    # post_pos_blacklist are POS-tag blacklists defined elsewhere.
    parsers = [
        dedash(),
        titlecaps(),
        decaps_text(),
        unidecoder(),
        separate_reference(),
        url_replacement(),
        replace_acronyms(counter=abbreviations, underscore=False),
        pos_tokenizer(pre_pos_blacklist),
        token_replacement(remove=True),
        replace_from_dictionary(),
        pos_tokenizer(post_pos_blacklist),
    ]

    for parser in parsers:
        text = parser(text)

    # remove_stopwords and lemmatize are helper functions defined elsewhere.
    text = remove_stopwords(text)
    text = lemmatize(text)
    return text
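# Illustrative usage of clean_text (not from the original source): the sample
# sentence is made up, and this assumes the nlpre parsers plus the
# remove_stopwords / lemmatize helpers referenced above are available in scope.
sample = ("Epidermal growth factor receptor (EGFR) signaling was blocked; "
          "details at http://example.org.")
print(clean_text(sample))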
def call(self, data):
    ABBR = identify_parenthetical_phrases()(data)
    parsers = [
        dedash(),
        # titlecaps(),
        separate_reference(),
        unidecoder(),
        token_replacement(),
        url_replacement(),
        # replace_acronyms(ABBR, underscore=False),
        # separated_parenthesis(),
        # replace_from_dictionary(prefix="MeSH_")
    ]

    cleansed = data
    for f in parsers:
        cleansed = f(cleansed)

    return cleansed.replace('\n', ' ')
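# Minimal harness for the method above (illustration only; the real enclosing
# class is not shown in this snippet and "TextPipeline" is a hypothetical name).
# It assumes the nlpre parsers used by call are imported as below.
from nlpre import (identify_parenthetical_phrases, dedash, separate_reference,
                   unidecoder, token_replacement, url_replacement)

class TextPipeline(object):
    pass

TextPipeline.call = call  # attach the function defined above as a method
print(TextPipeline().call("Auto-\nimmune disease was reported\non two lines."))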
def setup_class(cls):
    cls.parser = unidecoder()
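# Illustrative test method (not from the original suite), assuming this
# setup_class sits on a pytest-style test class and that unidecoder
# transliterates accented characters to their ASCII equivalents.
def test_accented_characters(self):
    doc = u'Caf\u00e9 au lait'
    assert 'Cafe' in self.parser(doc)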
import sys
import csv
import os
import itertools

from utils.os_utils import mkdir, grab_files
from utils.parallel_utils import jobmap

import nlpre
import logging

logger = logging.getLogger(__name__)

# Fix for pathological csv files
csv.field_size_limit(sys.maxsize)

_ref_counter = itertools.count()
parser_unicode = nlpre.unidecoder()


def map_to_unicode(s):
    '''
    Convert input string to unicode.

    Args:
        s: an input string document

    Returns:
        s: a copy of the input string in unicode
    '''
    # Helper function to fix input format (Python 2: str -> unicode)
    s = str(s)
    return s.decode('utf-8', errors='replace')
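# Example of the intended round trip (illustration only, Python 2 semantics,
# matching the str.decode call above): decode raw UTF-8 bytes to unicode,
# then transliterate the remaining non-ASCII characters with nlpre's unidecoder.
raw_bytes = 'Caf\xc3\xa9 au lait'        # UTF-8 encoded input
unicode_doc = map_to_unicode(raw_bytes)  # -> u'Caf\xe9 au lait'
ascii_doc = parser_unicode(unicode_doc)  # accented characters transliterated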
import sys
from argparse import ArgumentParser

from nlpre import titlecaps, dedash, identify_parenthetical_phrases
from nlpre import replace_acronyms, replace_from_dictionary
from nlpre import separated_parenthesis, unidecoder, token_replacement
from nlpre import url_replacement, separate_reference

if __name__ == '__main__':
    parser = ArgumentParser()
    parser.add_argument(
        "-t", "--text", dest="text",
        help="The text to clean", metavar="TEXT")

    args = parser.parse_args()
    data = args.text or ''

    ABBR = identify_parenthetical_phrases()(data)

    parsers = [
        dedash(),
        # titlecaps(),
        separate_reference(),
        unidecoder(),
        token_replacement(),
        url_replacement(),
        replace_acronyms(ABBR, underscore=False),
        separated_parenthesis(),
        # replace_from_dictionary(prefix="MeSH_")
    ]

    cleansed = data
    for f in parsers:
        cleansed = f(cleansed)

    sys.stdout.write(cleansed.replace('\n', ' '))
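# Example invocation of the script above (the filename below is hypothetical,
# and the sample text is made up):
#   python clean_text_cli.py --text "Epidermal growth factor receptor (EGFR) was measured."
# The cleaned text is written to stdout as a single line.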