def calculate_case_statistics():
    """Print per-case text statistics, one row per patient case.

    A header line is printed first (columns separated by ' | '), followed
    by one row per entry of PatientCase.ALL in sorted order. Each row joins
    its cells with ' & ' and ends with a LaTeX row terminator. The cells are:
    the case code, the number of newline-separated lines in the case text,
    the number of whitespace-separated tokens of the text found among the
    stopwords, the number of entries in the case vector, and the number of
    vector keys found among the medical terms. A blank line is printed last.
    """
    stopwords = data.get_stopwords()
    medical_terms = data.get_medical_terms()

    print("Case | Lines | Stopwords | Terms | Medical terms")
    for code, case in sorted(PatientCase.ALL.items()):
        line_count = len(case.text.split('\n'))
        stopword_count = sum(1 for token in case.text.split()
                             if token in stopwords)
        term_count = len(case.vector)
        medical_count = sum(1 for term in case.vector.keys()
                            if term in medical_terms)

        cells = (code, str(line_count), str(stopword_count),
                 str(term_count), str(medical_count))
        print(' & '.join(cells) + r' \\')
    print()
def main(script):
    """Populate the data objects, then emit every table and statistic.

    The 'script' argument is accepted but not used by this function.
    """
    # Populate all objects before anything reads them
    data.main()

    # One LaTeX table per word list: all stopwords, then all medical terms.
    # Each entry holds the getter plus the remaining arguments passed on to
    # _generate_columned_table.
    word_tables = (
        (data.get_stopwords, 6, 'stopwords', 'Norwegian stopwords'),
        (data.get_medical_terms, 3, 'medicalterms', 'Medical terms'),
    )
    for fetch_words, columns, key, heading in word_tables:
        _generate_columned_table(sorted(fetch_words()), columns, key, heading)

    generate_cases_table()
    calculate_chapter_statistics()
    calculate_case_statistics()
def parse_case_file(path, stopwords=None):
    """Read the case file at 'path' and build a PatientCase from its text.

    Every line is stripped and split on single spaces; words whose lowercase
    form is in 'stopwords' are dropped. Lines left empty by this filtering
    are skipped, and a single trailing period is removed from the others.
    The remaining lines are joined with newlines and handed to PatientCase
    together with the file's base name (extension removed, every occurrence
    of 'case' deleted). The created object is not returned.

    Args:
        path: Path of the case file to read.
        stopwords: Container of lowercase words to remove. When omitted (or
            None), get_stopwords() is called at call time rather than once
            when the function is defined.
    """
    # Resolve the default lazily: a call in the signature would run at import
    # time and freeze whatever the stopword list happened to be back then.
    if stopwords is None:
        stopwords = get_stopwords()

    text = []
    with open(path) as f:
        for raw_line in f:
            line = ' '.join(word for word in raw_line.strip().split(' ')
                            if word.lower() not in stopwords)
            if not line:
                continue
            if line.endswith('.'):
                line = line[:-1]  # Remove period from queries
            text.append(line)

    filename = os.path.splitext(os.path.basename(path))[0]
    PatientCase(filename.replace('case', ''), '\n'.join(text))
from math import log

from whoosh.index import create_in, open_dir, exists_in
from whoosh.analysis import StandardAnalyzer
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED
from whoosh.qparser import QueryParser, OrGroup

from data import ATC, ICD, PatientCase, Therapy, populate_all, get_stopwords

# Folder to store the whoosh index in
INDEX_DIR = 'whooshindex'

# Analyzer which removes stopwords. get_stopwords() is called once here,
# when this module is imported, and its result is used as the stoplist.
ANALYZER = StandardAnalyzer(stoplist=get_stopwords())

# Schema for storing and indexing ATC codes in the whoosh database.
# Both fields (code and title) are declared with stored=True.
ATC_SCHEMA = Schema(code=ID(stored=True), title=TEXT(stored=True))

# Schema for storing and indexing ICD10 codes in the whoosh database.
# Only code, short and label are declared with stored=True; label and
# description are the two fields that use the stopword-removing ANALYZER.
ICD_SCHEMA = Schema(code=ID(stored=True), short=ID(stored=True),
                    label=TEXT(stored=True, analyzer=ANALYZER), type=TEXT,
                    icpc2_code=ID, icpc2_label=TEXT, synonyms=TEXT,
                    terms=TEXT, inclusions=TEXT, exclusions=TEXT,
                    description=TEXT(analyzer=ANALYZER))