Beispiel #1
0
    def clipped_sentences_test(self):
        """Check how ``min_keep_length`` affects the parenthetical content.

        With ``min_keep_length`` of ``None`` or ``2`` the parenthetical
        "the best day" is expected back as its own line after the main
        sentence; with ``5`` only the main sentence is expected
        (presumably because the parenthetical is shorter than the
        threshold -- confirm against separated_parenthesis).
        """
        source = "Sunday (the best day) is a day of the week."

        main_only = 'Sunday is a day of the week .'
        with_aside = 'Sunday is a day of the week .\nthe best day .'

        # Build every parser up front, then run them in the same order.
        thresholds = (None, 2, 5)
        expectations = (with_aside, with_aside, main_only)
        parsers = [separated_parenthesis(min_keep_length=k) for k in thresholds]

        for parse, expected in zip(parsers, expectations):
            assert_equals(parse(source), expected)
def clean_text(text):
    """Spell out numbers in *text* and run it through the nlpre parsers.

    Space-delimited tokens consisting only of digits are replaced by the
    words returned from ``inflect``'s ``number_to_words``. The result is
    then passed, in order, through ``token_replacement``, ``dedash``,
    ``separated_parenthesis`` and ``replace_acronyms`` (fed with the
    phrases found by ``identify_parenthetical_phrases``). Finally newlines
    are replaced by spaces.

    If the parser output ends with a period but the last non-empty input
    token did not, that final period is removed (presumably it was added
    by the sentence-splitting parsers -- confirm against nlpre).

    Args:
        text: The string to clean.

    Returns:
        The cleaned, single-line string. Empty (or ``None``) input yields
        an empty string without touching the parsers.
    """
    if not text:
        # Previously an empty input reached ``text[-1]`` and raised
        # IndexError; there is nothing to clean, so stop early.
        return ''

    engine = None  # created lazily, once, instead of once per number
    tokens = []
    for token in text.split(' '):
        if token.isdigit():
            if engine is None:
                engine = inflect.engine()
            token = engine.number_to_words(token)
        tokens.append(token)
    text = ' '.join(tokens)

    # The acronym counter is computed from the digit-free text before any
    # other parser runs, exactly as the list literal did originally.
    # NOTE: decaps_text() and titlecaps() were disabled in the original
    # parser list and stay disabled here.
    parsers = [
        nlpre.token_replacement(),
        nlpre.dedash(),
        nlpre.separated_parenthesis(),
        nlpre.replace_acronyms(nlpre.identify_parenthetical_phrases()(text)),
    ]
    for parse in parsers:
        text = parse(text)

    # Use the last non-empty token so input ending in a space (which
    # produces an empty final token) no longer raises IndexError.
    last_token = next((t for t in reversed(tokens) if t), '')
    if text.endswith('.') and not last_token.endswith('.'):
        text = text[:-1]

    return text.replace('\n', ' ')
Beispiel #3
0
 def setup_class(cls):
     """Store a ``separated_parenthesis`` parser on the class for the tests.

     The parser is built with ``min_keep_length=0`` and assigned to
     ``cls.parser``. Presumably the test runner calls this hook once
     per class -- confirm against the runner in use.
     """
     cls.parser = separated_parenthesis(min_keep_length=0)
Beispiel #4
0
 def setup_class(cls):
     """Store a default-configured ``separated_parenthesis`` on the class.

     The parser is built with no arguments and assigned to
     ``cls.parser``. Presumably the test runner calls this hook once
     per class -- confirm against the runner in use.
     """
     cls.parser = separated_parenthesis()
Beispiel #5
0
import sys
from argparse import ArgumentParser

from nlpre import titlecaps, dedash, identify_parenthetical_phrases
from nlpre import replace_acronyms, replace_from_dictionary
from nlpre import separated_parenthesis, unidecoder, token_replacement
from nlpre import url_replacement, separate_reference

if __name__ == '__main__':
    # Command-line entry point: clean the text passed with -t/--text using
    # the nlpre parsers and write the result to stdout on a single line.
    parser = ArgumentParser()
    parser.add_argument(
        "-t", "--text", dest="text", help="The text to clean", metavar="TEXT")
    args = parser.parse_args()
    # A missing --text option is treated as empty input.
    data = args.text or ''

    # Parenthetical phrases are collected from the raw input, before any
    # parser has modified it, and handed to replace_acronyms below.
    ABBR = identify_parenthetical_phrases()(data)
    parsers = [
        dedash(),
        # titlecaps(),  (disabled)
        separate_reference(),
        unidecoder(),
        token_replacement(),
        url_replacement(),
        replace_acronyms(ABBR, underscore=False),
        separated_parenthesis(),
        # replace_from_dictionary(prefix="MeSH_")  (disabled)
    ]

    # Apply the parsers in list order, each consuming the previous output.
    cleansed = data
    for f in parsers:
        cleansed = f(cleansed)

    # Newlines in the parser output are flattened to spaces. No trailing
    # newline is written. Requires the module-level ``import sys``.
    sys.stdout.write(cleansed.replace('\n', ' '))