def get_file_sentences_and_idf(filename):
    """Read a document, split its <text> element into sentences, compute IDF.

    The file's lines are right-stripped and concatenated (without any
    separator), lower-cased, and parsed with BeautifulSoup's ``html.parser``.
    The text of the first ``text`` tag is sentence-tokenized with the
    module-level ``tokenizer``.

    input:  filename -- path of the document to read
    output: none returned; the results are stored in the module globals
            ``sentenceList`` (list of sentences) and ``n`` (its length),
            after which ``calculate_idf()`` is called.
    """
    global sentenceList
    global stemmedList  # declared for parity with the original; not assigned here
    global n

    # Use a context manager so the file handle is always closed.
    with open(filename, 'r') as doc:
        # Trailing whitespace/newlines are dropped and lines are glued together.
        data = "".join(line.rstrip() for line in doc)

    data = data.lower()

    # The document is XML-like; html.parser is lenient enough to read it.
    soup = BeautifulSoup(data, 'html.parser')

    # find() returns None when there is no <text> tag, hence AttributeError.
    try:
        text = soup.find("text").get_text()
    except AttributeError:
        text = ""

    # Join/split round trip kept on purpose: for empty text this yields ['']
    # (one empty sentence), so n is 1 rather than 0.
    sentenceList = '\n'.join(tokenizer.tokenize(text)).split('\n')
    n = len(sentenceList)

    calculate_idf()
def Tokenization(data, concept, stem, removeStopwords):
    """Turn raw text into a list of lower-case word tokens.

    Args:
        data: The text to tokenize.
        concept: When falsy, ``data`` is treated as markup and reduced to its
            visible text with BeautifulSoup first; when truthy it is used
            as-is.
        stem: When truthy, every token is reduced to its Porter stem.
        removeStopwords: When truthy, NLTK English stop words are dropped.

    Returns:
        A list of tokens consisting only of ``[a-z0-9_]`` characters.
    """
    if not concept:
        # An explicit parser keeps the result independent of which optional
        # parsers happen to be installed.
        data = BeautifulSoup(data, "html.parser").get_text()

    data = re.sub("\r\n", " ", data)
    # Everything that is not a letter, digit or underscore becomes a separator.
    data = re.sub("[^a-zA-Z0-9_]", " ", data)
    words = data.lower().split()

    if removeStopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if w not in stops]

    if stem:
        # The stemmer works on single words, so it is applied per token
        # (calling it on the whole text would treat the text as one word).
        # Stop words are removed first because their stems would no longer
        # match the stop list.
        stemmer = PorterStemmer()
        words = [stemmer.stem(w) for w in words]

    return words
def GetDataFeatures(self, data, dictionary):
    """Build a presence-feature mapping for ``data`` over ``dictionary``.

    Args:
        data: Text to inspect; it is lower-cased and split with
            ``word_tokenize``.
        dictionary: Iterable of words to look for.

    Returns:
        A dict mapping each dictionary word (lower-cased) to ``True`` when
        that lower-cased word is one of the tokens of ``data``, else ``False``.
    """
    # Tokenize once instead of once per dictionary word; a set gives O(1)
    # membership tests.
    tokens = set(word_tokenize(data.lower()))

    data_features = {}
    for word in dictionary:
        # Compare in lower case: the tokens are lower-cased, so testing the
        # raw word would never match an entry containing upper-case letters.
        lowered = word.lower()
        data_features[lowered] = lowered in tokens
    return data_features
def _make_condition_button(parent, label, matched):
    """Create one condition button inside ``parent`` and return it.

    A matched condition gets a green button wired to ``pval`` with the
    condition name and the sentence list box as arguments; on platforms other
    than win32 its caption is prefixed with a block character. An unmatched
    condition gets a red button with no command.
    """
    pady = resolution[rt]['button_pady']
    if not matched:
        btn = PushButton(parent, align="left", width="10", text=label, pady=pady)
        btn.bg = "#F84446"
        return btn

    caption = label if platform == 'win32' else "█" + label
    btn = PushButton(parent, align="left", width="10", text=caption, command=pval, pady=pady)
    btn.update_command(pval, [label, lbox_list[0]])
    btn.bg = "#9EF844"
    return btn


def analyze():
    """Search the text box contents for condition terms and rebuild the widgets.

    Steps:
      1. Clear the shared ``rdict`` / ``term_dict`` results and destroy the
         buttons and list boxes created by a previous run.
      2. Load ``conditions.csv``: column 0 is the condition name, column 1 a
         comma-separated list of search terms.
      3. Lower-case the text of ``text_box``, optionally strip the history and
         physical-exam sections (when ``preprocess_checkbox.value == 0``), and
         split it into sentences with the NLTK punkt tokenizer.
      4. For every sentence and condition, record which terms occur in the
         sentence: ``term_dict[(sentence, condition)]`` lists the matching
         terms, ``rdict[condition]`` lists the matching sentences.
      5. Show all sentences in a list box and create one button per condition,
         four per row in ``button_box_r1`` .. ``button_box_r8``.

    Nothing is returned; the results live in the module-level containers and
    widgets.
    """
    ldict = {}
    result = []
    rdict.clear()
    term_dict.clear()

    # Remove the widgets left over from the previous analysis.
    for old_btn in btn_list:
        old_btn.destroy()
    btn_list.clear()
    for old_lbox in lbox_list:
        old_lbox.destroy()
    lbox_list.clear()

    with open('conditions.csv', newline='') as f:
        for row in csv.reader(f):
            # Blank or incomplete rows have no term column; skip them instead
            # of failing with IndexError.
            if len(row) < 2:
                continue
            ldict[row[0]] = row[1].split(',')

    data_lower = text_box.value.lower()
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    if preprocess_checkbox.value == 0:
        # Cut out the sections that should not be searched before tokenizing.
        remove_history = re.sub('past medical history: diagnosis.*?drug use', ' drug use ', data_lower, flags=re.DOTALL)
        remove_physicala = re.sub('physical exam.*?assessment/plan', '. ASSESSMENT', remove_history, flags=re.DOTALL)
        remove_complete = re.sub('physical exam.*?assessment and plan', '. ASSESSMENT', remove_physicala, flags=re.DOTALL)
        sentence_list = tokenizer.tokenize(remove_complete)
    else:
        sentence_list = tokenizer.tokenize(data_lower)

    for sentence in sentence_list:
        for key, terms in ldict.items():
            for term in terms:
                fixed_word = term.lower()
                if fixed_word == "$date$":
                    # Special placeholder: add the found dates as terms.
                    # NOTE(review): date_match is defined outside this
                    # function -- presumably the dates found in the text.
                    # The list is extended while being iterated on purpose, so
                    # the new dates are checked later in this same pass. Only
                    # dates not yet present are added; otherwise the list
                    # would grow again for every sentence.
                    new_dates = [d for d in date_match if d not in terms]
                    terms.extend(new_dates)
                    continue
                if fixed_word not in sentence:
                    continue

                # Remember every term that matched this sentence/condition pair.
                term_dict.setdefault((sentence, key), []).append(fixed_word)

                # Record each (condition, sentence) pair only once.
                if [key, sentence] not in result:
                    result.append([key, sentence])
                    rdict.setdefault(key, []).append(sentence)

    lbox_list.append(
        ListBox(form_box, items=sentence_list, width="fill",
                height=resolution[rt]['lbox_height'], command=dispay_full,
                multiselect=True, scrollbar=True))
    lbox_list[0].bg = "#C8D7E9"

    # Conditions are laid out four per row, in file order.
    button_rows = (button_box_r1, button_box_r2, button_box_r3, button_box_r4,
                   button_box_r5, button_box_r6, button_box_r7, button_box_r8)
    for counter, k in enumerate(ldict):
        row = counter // 4
        if row >= len(button_rows):
            # No row is left for further conditions. Stop here rather than
            # appending the previous button again, which would make the next
            # run destroy the same widget twice.
            break
        btn_list.append(_make_condition_button(button_rows[row], k, k in rdict))