Example #1
0
def import_regex(regex_file):
    """Load the rules stored in one regex rule file

    A path ending in ".txt" is read with di.regexes_from_csv, which supplies
    a single rule; any other path is read with di.regexes_from_json2, which
    is iterated and may supply several rules.

    Arguments:
        regex_file {string} -- Path to the regex rule file

    Returns:
        classifier_type {string} -- The classifier type reported by the last rule read, or "RegexClassifier" when none was reported
        classifier_args {dictionary} -- The classifier arguments reported by the last rule read ({} when no rule was read)
        regexes {dictionary} -- Maps each rule (class) name to the regexes loaded for it
    """
    rule_regexes = {}
    classifier_type, classifier_args = None, {}

    # Normalise both loaders to "an iterable of 4-item packs" so a single
    # loop handles them.
    if regex_file.endswith(".txt"):
        packs = [di.regexes_from_csv(regex_file, use_custom_score=True)]
    else:
        packs = di.regexes_from_json2(regex_file, use_custom_score=True)

    # Each pack is (classifier type, classifier args, rule name, regexes);
    # the type/args of the last pack are the ones that are kept.
    for classifier_type, classifier_args, rule_name, rule_list in packs:
        rule_regexes[rule_name] = rule_list

    return classifier_type or "RegexClassifier", classifier_args, rule_regexes
Example #2
0
def import_regexes3(regexes_directory, mode="advanced"):
    """Import every rule file of one format from a directory

    Arguments:
        regexes_directory {string} -- Path to the directory which contains the rule files

    Keyword Arguments:
        mode {string} -- "advanced" selects the ".txt" rule files, any other value selects the ".json" rule files (default: {"advanced"})

    Returns:
        classifier_type {string} -- Taken from rule_settings.json when that file is in the directory, otherwise "RegexClassifier"
        classifier_args {dictionary} -- Taken from rule_settings.json when that file is in the directory, otherwise {}
        regexes {dictionary} -- Maps each rule (class) name to what the loader returned for it
    """
    directory_entries = os.listdir(regexes_directory)

    # These files hold settings, not rules, so they are never loaded as rules.
    excluded = ["rule_settings.json", "rule_properties.json"]

    if mode == "advanced":
        wanted_suffix = ".txt"
    else:
        wanted_suffix = ".json"

    regexes = {}
    for entry in directory_entries:
        if not entry.endswith(wanted_suffix) or entry in excluded:
            continue

        rule_path = os.path.join(regexes_directory, entry)
        if rule_path.endswith(".txt"):
            loaded = di.regexes_from_csv(rule_path, use_custom_score=True)
        else:
            loaded = di.regexes_from_json(rule_path, use_custom_score=True)

        # The loaders are expected to hand back exactly (rule name, regexes).
        rule_name, rule_list = loaded
        regexes[rule_name] = rule_list

    if "rule_settings.json" in directory_entries:
        settings_path = os.path.join(regexes_directory, "rule_settings.json")
        classifier_type, classifier_args = di.read_classifier_settings(
            settings_path)
    else:
        classifier_type, classifier_args = "RegexClassifier", {}

    return classifier_type, classifier_args, regexes
Example #3
0
def import_regexes2(regexes_directory):
    """Import every ".txt" rule file found in a directory

    Arguments:
        regexes_directory {string} -- Path to the directory which contains the rule files

    Returns:
        classifier_type {string} -- Taken from rule_settings.json when that file is in the directory, otherwise "RegexClassifier"
        classifier_args {dictionary} -- Taken from rule_settings.json when that file is in the directory, otherwise {}
        regexes {dictionary} -- Maps each rule (class) name to the regexes loaded for it
    """
    settings_name = "rule_settings.json"
    directory_entries = os.listdir(regexes_directory)

    regexes = {}
    for entry in directory_entries:
        if not entry.endswith(".txt"):
            continue

        rule_path = os.path.join(regexes_directory, entry)
        # The per-file classifier type/args (first two items) are ignored
        # here; only the directory-level settings file is consulted below.
        _type, _args, rule_name, rule_list = di.regexes_from_csv(
            rule_path, use_custom_score=True)
        regexes[rule_name] = rule_list

    if settings_name in directory_entries:
        classifier_type, classifier_args = di.read_classifier_settings(
            os.path.join(regexes_directory, settings_name))
    else:
        classifier_type, classifier_args = "RegexClassifier", {}

    return classifier_type, classifier_args, regexes
Example #4
0
def import_regexes(regex_directory):
    """Import multiple regex rule files which will be used in multiclass classification

    Every entry of the directory is loaded: paths ending in ".txt" through
    di.regexes_from_csv (one rule per file), all other paths through
    di.regexes_from_json2 (iterated, possibly several rules per file).

    Entries are processed in sorted name order. When several files report a
    classifier type or classifier arguments, or define the same rule name,
    the one that sorts last wins, so the result does not depend on the order
    in which the operating system happens to list the directory.

    Arguments:
        regex_directory {string} -- Path to the directory which contains the rule files for a category e.g smoking status

    Returns:
        classifier_type {string} -- The type of classifier that is required by the rule. E.g RegexClassifier, CaptureClassifier etc.. Defaults to "RegexClassifier" when no file reports one
        classifier_args {dictionary} -- A dictionary of arguments that will be used by the classifier ({} when no file reports any)
        regexes {dictionary} -- A dictionary that maps the rule to the regexes loaded for that rule
    """
    # os.listdir returns names in arbitrary order; sorting makes the
    # "last file wins" precedence below reproducible across machines.
    file_names = sorted(os.listdir(regex_directory))

    regexes = {}
    classifier_type = None
    classifier_args = {}

    for fname in file_names:
        path = os.path.join(regex_directory, fname)

        # Normalise both loaders to "an iterable of 4-item packs".
        if path.endswith(".txt"):
            packs = [di.regexes_from_csv(path, use_custom_score=True)]
        else:
            packs = di.regexes_from_json2(path, use_custom_score=True)

        # Per-file settings: those of the last pack in the file are kept.
        file_type, file_args = None, {}
        for file_type, file_args, class_name, class_regexes in packs:
            regexes[class_name] = class_regexes

        # A file only overrides the running settings when it actually
        # provides a non-empty value.
        if file_type:
            classifier_type = file_type
        if file_args:
            classifier_args = file_args

    return classifier_type or "RegexClassifier", classifier_args, regexes
Example #5
0
from datahandler import data_import as di
from stats.basic import calculate_accuracy
from web.report_generator import generate_error_report
from web.report_generator import generate_classification_report
import os

if __name__ == "__main__":

    debug = True

    #Reading regex files
    regexes = {}

    #Importing immigration regexes
    filename = os.path.join("..","examples","regexes","tb_regexes", "immigration_country.txt")
    _, _, class_name, regexes[class_name] = di.regexes_from_csv(filename, use_custom_score=True, all_matches=False)

    if not debug:
        #Location of TB_DATA_FOLDER
        print("Current data folder: {!r}\n".format(os.getenv('TB_DATA_FOLDER')))

        #Label files and data files
        data_filenames = [os.path.normpath(os.path.join(os.getenv('TB_DATA_FOLDER'), 'NLP Study (TB Clinic) Cohort 2 (really cleansed).csv'))]
        label_filenames = [os.path.normpath(os.path.join(os.getenv('TB_DATA_FOLDER'), 'NLP Study (TB Clinic) Manual Chart Extraction - Cohort 2.xlsx'))]
        print("Files of interest: {!r}\n".format(data_filenames))
        print("Files of interest: {!r}\n".format(label_filenames))

        #Reading label data and importing data
        data, _, ids = di.data_from_csv(data_filenames, data_cols=2, id_cols=0, repeat_ids=False)
        _, temp_labels, temp_ids = di.data_from_excel(label_filenames, id_cols=1, label_cols=7, repeat_ids=False, first_row=2, check_col=1)
        labels = ["None"] * len(data)