def setUp(self):
    # Build the shared fixtures for the sentence-splitter tests.
    # NOTE(review): the original comment said "create a sample image in
    # memory" — that looks copy-pasted from an image test; this fixture
    # only builds text/markup objects. Confirm against repo history.
    self.context = pyConText.pyConText()
    self.splitter = helpers.sentenceSplitter()
    # Swedish radiology snippet with inline XML-style <Diagnosis> markup,
    # escaped Latin-1 characters, and an embedded newline before the "?".
    self.su1 = u'kanso <Diagnosis>**diabetes**</Diagnosis> utesl\xf6t eller diabetes men inte s\xe4kert. Vi siktar p\xe5 en r\xf6ntgenkontroll. kan det vara nej panik\xe5ngesten\n?'
    # Radiology impression containing an enumeration number ("1.") that a
    # naive splitter could mistake for a sentence terminator.
    self.su2 = u'IMPRESSION: 1. LIMITED STUDY DEMONSTRATING NO GROSS EVIDENCE OF SIGNIFICANT PULMONARY EMBOLISM.'
    # Two sentences, the second ending in a bare number ("1.").
    self.su3 = u'This is a sentence that does not end with a number. But this sentence ends with 1.'
def setUp(self):
    # Build the shared fixtures for the sentence-splitter / context tests.
    # NOTE(review): the original comment said "create a sample image in
    # memory" — likely copy-pasted from an image test; this builds text
    # fixtures and context items only.
    self.context = pyConText.ConTextMarkup()
    self.splitter = helpers.sentenceSplitter()
    # Swedish radiology snippet with inline XML-style <Diagnosis> markup,
    # escaped Latin-1 characters, and an embedded newline before the "?".
    self.su1 = u'kanso <Diagnosis>**diabetes**</Diagnosis> utesl\xf6t eller diabetes men inte s\xe4kert. Vi siktar p\xe5 en r\xf6ntgenkontroll. kan det vara nej panik\xe5ngesten\n?'
    # Enumeration number ("1.") that must not be treated as a terminator.
    self.su2 = u'IMPRESSION: 1. LIMITED STUDY DEMONSTRATING NO GROSS EVIDENCE OF SIGNIFICANT PULMONARY EMBOLISM.'
    # Three sentences; the second ends in a bare number ("1.") followed by
    # a further sentence — splitter should find all three.
    self.su3 = u'This is a sentence that does not end with a number. But this sentence ends with 1. So this should be recognized as a third sentence.'
    # A decimal value (1.43) that must not cause a sentence split.
    self.su4 = u'This is a sentence with a numeric value equal to 1.43 and should not be split into two parts.'
    # Context items: [literal, category, regex, rule-direction].
    # ur"""…""" is a Python 2 raw-unicode literal (invalid in Python 3).
    self.items = [ [u"pulmonary embolism",u"PULMONARY_EMBOLISM",ur"""pulmonary\s(artery )?(embol[a-z]+)""",""],["no gross evidence of","PROBABLE_NEGATED_EXISTENCE","","forward"]]
    self.itemData = itemData.itemData()
    for i in self.items:
        # NOTE(review): loop body appears truncated here — `cit` is bound to
        # the contextItem class itself and never constructed or registered.
        # Presumably the original continued `itemData.contextItem(i)` plus a
        # call adding it to self.itemData; confirm against repo history.
        cit = itemData.contextItem
def setUp(self):
    # Build the shared fixtures for the sentence-splitter / context tests.
    # NOTE(review): the original comment said "create a sample image in
    # memory" — likely copy-pasted from an image test; this builds text
    # fixtures and context items only.
    self.context = pyConText.ConTextMarkup()
    self.splitter = helpers.sentenceSplitter()
    # Swedish radiology snippet with inline XML-style <Diagnosis> markup,
    # escaped Latin-1 characters, and an embedded newline before the "?".
    self.su1 = u'kanso <Diagnosis>**diabetes**</Diagnosis> utesl\xf6t eller diabetes men inte s\xe4kert. Vi siktar p\xe5 en r\xf6ntgenkontroll. kan det vara nej panik\xe5ngesten\n?'
    # Enumeration number ("1.") that must not be treated as a terminator.
    self.su2 = u'IMPRESSION: 1. LIMITED STUDY DEMONSTRATING NO GROSS EVIDENCE OF SIGNIFICANT PULMONARY EMBOLISM.'
    # Two sentences; the second ends in a bare number ("1.").
    self.su3 = u'This is a sentence that does not end with a number. But this sentence ends with 1.'
    # Context items: [literal, category, regex, rule-direction].
    # ur"""…""" is a Python 2 raw-unicode literal (invalid in Python 3).
    self.items = [[ u"pulmonary embolism", u"PULMONARY_EMBOLISM", ur"""pulmonary\s(artery )?(embol[a-z]+)""", "" ], [ "no gross evidence of", "PROBABLE_NEGATED_EXISTENCE", "", "forward" ]]
    self.itemData = itemData.itemData()
    for i in self.items:
        # NOTE(review): loop body appears truncated here — `cit` is bound to
        # the contextItem class itself and never constructed or registered.
        # Presumably the original continued `itemData.contextItem(i)` plus a
        # call adding it to self.itemData; confirm against repo history.
        cit = itemData.contextItem
def main(directory, N):
    """Run the word-frequency pipeline over every document in *directory*.

    Globs the directory, builds a stop-word-filtered frequency table,
    joins document names onto the parsed text, splits each document into
    sentences, and writes the assembled report to ``output.csv``.

    Args:
        directory: path to the folder of input documents.
        N: count parameter forwarded to the output constructor.
    """
    # Announce the run configuration.
    print(
        f"The path to the file directory specified is {directory} with an N value of {N}."
    )
    print(
        '----------------------------------------------------------------------------------------------------------'
    )

    # Pull every file in the directory into memory via the glob helper.
    names, corpus = helpers.globber(directory=directory)

    # Word frequencies over the whole corpus, with stop words removed.
    frequencies = helpers.stopWordRemover(
        file='stop-word-list.csv',
        frequency_table=helpers.frequencyCounter(data=corpus),
    )

    # Attach the truncated document names to the parsed text so the corpus
    # becomes one comprehensive dataframe.
    corpus = names.join(pd.DataFrame(corpus, columns=['text']))

    # Per-document sentence splits feed the final report, which is written
    # out as CSV.
    sentences = helpers.sentenceSplitter(data=corpus)
    report = helpers.outputConstructor(
        freq_df=frequencies, split_sentences=sentences, data=corpus, n=N
    )
    report.to_csv('output.csv')
def test_sentenceSplitter2(self):
    """test whether we properly skip numbers with decimal points."""
    pieces = helpers.sentenceSplitter().splitSentences(self.su4)
    # A decimal like 1.43 must not be mistaken for a sentence boundary.
    assert len(pieces) == 1
def test_sentenceSplitter1(self):
    """test whether we properly capture text that terminates without a recognized sentence termination"""
    pieces = helpers.sentenceSplitter().splitSentences(self.su3)
    # All three sentences should be recovered, including the trailing one.
    assert len(pieces) == 3
def test_createSentenceSplitter(self):
    """A sentenceSplitter instance can be constructed and is truthy."""
    splitter = helpers.sentenceSplitter()
    assert splitter
def test_sentenceSplitter1(self):
    """test whether we properly capture text that terminates without a recognized sentence termination"""
    pieces = helpers.sentenceSplitter().splitSentences(self.su3)
    # Both sentences should be recovered, including the one ending in "1."
    assert len(pieces) == 2