def import_regex(regex_file):
    """Load the rules stored in one regex rule file.

    Files ending in ".txt" are read with ``di.regexes_from_csv``; any other
    file is read with ``di.regexes_from_json2``, which may yield several
    rules.

    Arguments:
        regex_file {string} -- Path to the regex rule file

    Returns:
        classifier_type {string} -- The type of classifier that is required
            by the rule, e.g. RegexClassifier, CaptureClassifier. Defaults
            to "RegexClassifier" when the file does not provide one.
        classifier_args {dictionary} -- A dictionary of arguments that will
            be used by the classifier
        regexes {dictionary} -- A dictionary that maps each rule name to the
            regexes loaded for that rule
    """
    regexes = {}

    if not regex_file.endswith(".txt"):
        # A JSON file can hold several rules; the type/args of the last
        # yielded rule are the ones that end up being returned.
        classifier_type, classifier_args = None, {}
        for pack in di.regexes_from_json2(regex_file, use_custom_score=True):
            classifier_type, classifier_args, rule_name, rule_regexes = pack
            regexes[rule_name] = rule_regexes
    else:
        loaded = di.regexes_from_csv(regex_file, use_custom_score=True)
        classifier_type, classifier_args, rule_name, rule_regexes = loaded
        regexes[rule_name] = rule_regexes

    # Fall back to the plain regex classifier when none was specified.
    if not classifier_type:
        classifier_type = "RegexClassifier"

    return classifier_type, classifier_args, regexes
def import_regexes3(regexes_directory, mode="advanced"):
    """Import every rule file of one format from a directory.

    Arguments:
        regexes_directory {string} -- Path to the directory that contains
            the rule files
        mode {string} -- "advanced" selects the ".txt" rule files; any other
            value selects the ".json" rule files (the settings files
            "rule_settings.json" and "rule_properties.json" are never
            treated as rule files)

    Returns:
        classifier_type {string} -- Read from "rule_settings.json" when that
            file exists in the directory, otherwise "RegexClassifier"
        classifier_args {dictionary} -- Read from "rule_settings.json" when
            that file exists in the directory, otherwise an empty dictionary
        regexes {dictionary} -- A dictionary that maps each rule name to the
            regexes loaded for that rule
    """
    file_names = os.listdir(regexes_directory)
    settings_files = ("rule_settings.json", "rule_properties.json")
    ext = ".txt" if mode == "advanced" else ".json"

    regexes = {}
    for fname in file_names:
        if not fname.endswith(ext) or fname in settings_files:
            continue

        rule_path = os.path.join(regexes_directory, fname)
        if rule_path.endswith(".txt"):
            loaded = di.regexes_from_csv(rule_path, use_custom_score=True)
        else:
            loaded = di.regexes_from_json(rule_path, use_custom_score=True)

        # NOTE(review): the other callers of di.regexes_from_csv in this
        # file unpack four values (type, args, name, regexes), whereas this
        # function used to unpack only two. Taking the trailing pair keeps
        # working with either return shape -- confirm against data_import.
        class_name, rule_regexes = tuple(loaded)[-2:]
        regexes[class_name] = rule_regexes

    if "rule_settings.json" in file_names:
        classifier_type, classifier_args = di.read_classifier_settings(
            os.path.join(regexes_directory, "rule_settings.json"))
    else:
        classifier_type = "RegexClassifier"
        classifier_args = {}

    return classifier_type, classifier_args, regexes
def import_regexes2(regexes_directory):
    """Import every ".txt" rule file found in a directory.

    Arguments:
        regexes_directory {string} -- Path to the directory that contains
            the rule files

    Returns:
        classifier_type {string} -- Read from "rule_settings.json" when that
            file exists in the directory, otherwise "RegexClassifier"
        classifier_args {dictionary} -- Read from "rule_settings.json" when
            that file exists in the directory, otherwise an empty dictionary
        regexes {dictionary} -- A dictionary that maps each rule name to the
            regexes loaded for that rule
    """
    dir_entries = os.listdir(regexes_directory)

    regexes = {}
    for entry in dir_entries:
        if not entry.endswith(".txt"):
            continue
        rule_path = os.path.join(regexes_directory, entry)
        # Per-file classifier type/args are ignored here; the settings file
        # (if any) is the only source for them.
        _, _, rule_name, rule_regexes = di.regexes_from_csv(
            rule_path, use_custom_score=True)
        regexes[rule_name] = rule_regexes

    if "rule_settings.json" in dir_entries:
        settings_path = os.path.join(regexes_directory, "rule_settings.json")
        classifier_type, classifier_args = di.read_classifier_settings(
            settings_path)
    else:
        classifier_type, classifier_args = "RegexClassifier", {}

    return classifier_type, classifier_args, regexes
def import_regexes(regex_directory):
    """Import all rule files of a directory for multiclass classification.

    Every entry of the directory is loaded: entries ending in ".txt" go
    through ``di.regexes_from_csv``, all others through
    ``di.regexes_from_json2``.

    Arguments:
        regex_directory {string} -- Path to the directory which contains the
            rule files for a category, e.g. smoking status

    Returns:
        classifier_type {string} -- The type of classifier that is required
            by the rules, e.g. RegexClassifier, CaptureClassifier. The last
            non-empty type found wins; defaults to "RegexClassifier".
        classifier_args {dictionary} -- A dictionary of arguments that will
            be used by the classifier (the last non-empty one found)
        regexes {dictionary} -- A dictionary that maps each rule name to the
            regexes loaded for that rule
    """
    regexes = {}
    classifier_type = None
    classifier_args = {}

    for entry in os.listdir(regex_directory):
        rule_path = os.path.join(regex_directory, entry)

        if rule_path.endswith(".txt"):
            file_type, file_args, rule_name, rule_regexes = \
                di.regexes_from_csv(rule_path, use_custom_score=True)
            regexes[rule_name] = rule_regexes
        else:
            file_type, file_args = None, {}
            for pack in di.regexes_from_json2(rule_path,
                                              use_custom_score=True):
                file_type, file_args, rule_name, rule_regexes = pack
                regexes[rule_name] = rule_regexes

        # Only overwrite the running values when this file supplied some.
        if file_type:
            classifier_type = file_type
        if file_args:
            classifier_args = file_args

    return classifier_type or "RegexClassifier", classifier_args, regexes
from datahandler import data_import as di from stats.basic import calculate_accuracy from web.report_generator import generate_error_report from web.report_generator import generate_classification_report import os if __name__ == "__main__": debug = True #Reading regex files regexes = {} #Importing immigration regexes filename = os.path.join("..","examples","regexes","tb_regexes", "immigration_country.txt") _, _, class_name, regexes[class_name] = di.regexes_from_csv(filename, use_custom_score=True, all_matches=False) if not debug: #Location of TB_DATA_FOLDER print("Current data folder: {!r}\n".format(os.getenv('TB_DATA_FOLDER'))) #Label files and data files data_filenames = [os.path.normpath(os.path.join(os.getenv('TB_DATA_FOLDER'), 'NLP Study (TB Clinic) Cohort 2 (really cleansed).csv'))] label_filenames = [os.path.normpath(os.path.join(os.getenv('TB_DATA_FOLDER'), 'NLP Study (TB Clinic) Manual Chart Extraction - Cohort 2.xlsx'))] print("Files of interest: {!r}\n".format(data_filenames)) print("Files of interest: {!r}\n".format(label_filenames)) #Reading label data and importing data data, _, ids = di.data_from_csv(data_filenames, data_cols=2, id_cols=0, repeat_ids=False) _, temp_labels, temp_ids = di.data_from_excel(label_filenames, id_cols=1, label_cols=7, repeat_ids=False, first_row=2, check_col=1) labels = ["None"] * len(data)