Example #1
    def test_get_term_parent_hierarchies(self):
        lookup_table = {"parents": {"a": ["b"], "b": ["c"], "d": ["e", "f"], "g": ["h", "i"],
                                    "i": ["j"]}}
        self.assertCountEqual([["z"]],
                              pipeline_helpers.get_term_parent_hierarchies("z", lookup_table))
        self.assertCountEqual([["c"]],
                              pipeline_helpers.get_term_parent_hierarchies("c", lookup_table))
        self.assertCountEqual([["b", "c"]],
                              pipeline_helpers.get_term_parent_hierarchies("b", lookup_table))
        self.assertCountEqual([["a", "b", "c"]],
                              pipeline_helpers.get_term_parent_hierarchies("a", lookup_table))
        self.assertCountEqual([["d", "e"], ["d", "f"]],
                              pipeline_helpers.get_term_parent_hierarchies("d", lookup_table))
        self.assertCountEqual([["g", "h"], ["g", "i", "j"]],
                              pipeline_helpers.get_term_parent_hierarchies("g", lookup_table))
Example #2
def find_match(map_result, ontology_lookup_table):
    """Attempt to map each input term in ``map_result`` to ontology terms.

    Tries a full-term match first, then falls back to n-gram component
    matches, recording the results in the mapping dictionaries inside
    ``map_result``, which is returned.
    """
    mapping_output = map_result["mapping_output"]
    input_to_ontology_mapping = map_result["input_to_ontology_mapping"]
    ontology_to_input_mapping = map_result["ontology_to_input_mapping"]
    input_term_label_map = map_result["input_term_label"]
    ontology_term_label_map = map_result["ontology_term_label"]

    for input_term, mapping_object in mapping_output.items():

        input_term_id = get_term_id(input_term)
        input_term_label = get_term_label(input_term)

        # Standardize sample: lowercase it and apply punctuation treatment.
        sample = input_term_label.lower()
        sample = helpers.punctuation_treatment(sample)

        # Tokenize sample; dates, numbers, and single-letter tokens are dropped below
        sample_tokens = word_tokenize(sample)

        # Get "cleaned_sample"
        cleaned_sample = ""
        for token in sample_tokens:
            # Ignore dates and numbers
            if helpers.is_date(token) or helpers.is_number(token):
                continue
            # Ignore single-letter tokens
            if helpers.is_single_letter(token):
                continue

            # Some preprocessing
            token = helpers.preprocess(token)

            lemma = helpers.singularize_token(
                token, ontology_lookup_table, [])
            lemma = helpers.spelling_correction(
                lemma, ontology_lookup_table, [])
            lemma = helpers.abbreviation_normalization_token(
                lemma, ontology_lookup_table, [])
            lemma = helpers.non_English_normalization_token(
                lemma, ontology_lookup_table, [])

            cleaned_sample = helpers.get_cleaned_sample(
                cleaned_sample, lemma, ontology_lookup_table)
            cleaned_sample = re.sub(' +', ' ', cleaned_sample)
            cleaned_sample = helpers.abbreviation_normalization_phrase(
                cleaned_sample, ontology_lookup_table, [])
            cleaned_sample = helpers.non_English_normalization_phrase(
                cleaned_sample, ontology_lookup_table, [])

        cleaned_sample = helpers.remove_duplicate_tokens(cleaned_sample)

        # Attempt full term match
        full_term_match = helpers.map_term(sample, ontology_lookup_table)

        if not full_term_match:
            # Attempt full term match with cleaned sample
            full_term_match =\
                helpers.map_term(cleaned_sample,
                                 ontology_lookup_table)
        if not full_term_match:
            # Attempt full term match using suffixes
            full_term_match =\
                helpers.map_term(sample,
                                 ontology_lookup_table,
                                 consider_suffixes=True)
        if not full_term_match:
            # Attempt full term match with cleaned sample using suffixes
            full_term_match =\
                helpers.map_term(cleaned_sample,
                                 ontology_lookup_table,
                                 consider_suffixes=True)

        if full_term_match:
            ontology_term_id = full_term_match["id"].upper()
            ontology_term_label = full_term_match["term"]
            ontology_term = "%s:%s" % (ontology_term_label, ontology_term_id)
            mapping_output[input_term] = ontology_term
            input_to_ontology_mapping[input_term_id] = ontology_term_id
            ontology_to_input_mapping[ontology_term_id] = input_term_id
            ontology_term_label_map[ontology_term_id] = full_term_match["term"]
            input_term_label_map[input_term_id] = input_term_label
        else:
            # Attempt various component matches
            component_matches = []
            covered_tokens = set()

            for i in range(5, 0, -1):
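                # n-gram chunks are tried from length 5 down to 1, so longer
                # (more specific) matches are found first and mark their
                # tokens as covered.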
                for gram_chunk in helpers.get_gram_chunks(cleaned_sample, i):
                    concat_gram_chunk = " ".join(gram_chunk)
                    gram_tokens = word_tokenize(concat_gram_chunk)
                    gram_permutations =\
                        list(OrderedDict.fromkeys(permutations(
                            concat_gram_chunk.split())))

                    # gram_tokens covered in prior component match
                    if set(gram_tokens) <= covered_tokens:
                        continue

                    for gram_permutation in gram_permutations:
                        gram_permutation_str = " ".join(gram_permutation)
                        component_match =\
                            helpers.map_term(gram_permutation_str,
                                             ontology_lookup_table)
                        if not component_match:
                            # Try again with suffixes
                            component_match =\
                                helpers.map_term(gram_permutation_str,
                                                 ontology_lookup_table,
                                                 consider_suffixes=True)
                        if component_match:
                            component_matches.append(component_match)
                            covered_tokens.update(gram_tokens)
                            break

            # We should not consider component matches that are
            # ancestral to other component matches.
            ancestors = set()
            for component_match in component_matches:
                component_match_hierarchies =\
                    helpers.get_term_parent_hierarchies(component_match["id"],
                                                        ontology_lookup_table)

                for component_match_hierarchy in component_match_hierarchies:
                    # Drop the first element (the matched term itself);
                    # only its ancestors matter here.
                    component_match_hierarchy.pop(0)

                    ancestors |= set(component_match_hierarchy)

            matched_components = []
            for component_match in component_matches:
                if component_match["id"] not in ancestors:
                    matched_component =\
                        "%s:%s" % (component_match["term"],
                                   component_match["id"])
                    matched_components.append(matched_component)

            # TODO: revisit this step. We do need it, but perhaps the
            # function could be simplified?
            if matched_components:
                matched_components = helpers.retain_phrase(matched_components)

            if matched_components:
                if len(matched_components) == 1:
                    single_value = matched_components[0]
                    onto_term_id = get_term_id(single_value)
                    onto_term_label = get_term_label(single_value)
                    mapping_output[input_term] = single_value
                    input_to_ontology_mapping[input_term_id] = onto_term_id
                    if onto_term_id not in ontology_to_input_mapping:
                        ontology_to_input_mapping[onto_term_id] = input_term_id
                    ontology_term_label_map[onto_term_id] = onto_term_label
                    input_term_label_map[input_term_id] = input_term_label
                else:
                    mapping_output[input_term] = matched_components
                    input_to_ontology_mapping[input_term_id] = [
                        get_term_id(s) for s in matched_components]
                    input_term_label_map[input_term_id] = input_term_label
                    for ontology_term in matched_components:
                        onto_term_id = get_term_id(ontology_term)
                        onto_term_label = get_term_label(ontology_term)
                        ontology_term_label_map[onto_term_id] = onto_term_label

    return map_result
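For context, ``find_match`` expects ``map_result`` to carry the five sub-dictionaries it unpacks at the top, all of which it updates in place. A rough sketch of that shape, with hypothetical keys and values (the exact input-term format depends on ``get_term_id``/``get_term_label``, which are not shown here):

# Rough sketch of the map_result structure consumed by find_match;
# the keys of the inner dictionaries shown here are hypothetical.
map_result = {
    "mapping_output": {"chicken breast:INPUT_0001": None},  # one entry per input term
    "input_to_ontology_mapping": {},   # input term id -> ontology term id(s)
    "ontology_to_input_mapping": {},   # ontology term id -> input term id
    "input_term_label": {},            # input term id -> input term label
    "ontology_term_label": {},         # ontology term id -> ontology term label
}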
Example #3
def classify_sample(sample, matched_terms_with_ids, lookup_table,
                    classification_lookup_table):
    """TODO..."""

    # LexMapr and IFSAC buckets mapped to the parental hierarchies of
    # each element in ``matched_terms_with_ids``.
    lexmapr_hierarchy_buckets = []
    ifsac_hierarchy_buckets = []
    # Lowest-level mapping for each element in ``matched_terms_with_ids``.
    lexmapr_final_buckets = []
    ifsac_final_buckets = []
    # IFSAC labels corresponding to the buckets in
    # ``ifsac_final_buckets``.
    ifsac_final_labels = []

    if matched_terms_with_ids:
        for matched_term_with_id in matched_terms_with_ids:
            [_, term_id] = matched_term_with_id.split(":", 1)
            matched_term_hierarchies = get_term_parent_hierarchies(
                term_id, lookup_table)

            for matched_term_hierarchy in matched_term_hierarchies:
                lexmapr_hierarchy_bucket = \
                    classify_sample_helper(matched_term_hierarchy,
                                           classification_lookup_table["buckets_lexmapr"])

                if lexmapr_hierarchy_bucket:
                    lexmapr_hierarchy_buckets.append(lexmapr_hierarchy_bucket)

                    lexmapr_final_bucket_level = min(
                        lexmapr_hierarchy_bucket.keys())
                    lexmapr_final_bucket = lexmapr_hierarchy_bucket[
                        lexmapr_final_bucket_level]
                    if lexmapr_final_bucket not in lexmapr_final_buckets:
                        lexmapr_final_buckets.append(lexmapr_final_bucket)

                ifsac_hierarchy_bucket = \
                    classify_sample_helper(matched_term_hierarchy,
                                           classification_lookup_table["buckets_ifsactop"])

                if ifsac_hierarchy_bucket:
                    ifsac_hierarchy_buckets.append(ifsac_hierarchy_bucket)

                    ifsac_final_bucket_level = min(
                        ifsac_hierarchy_bucket.keys())
                    ifsac_final_bucket = \
                        ifsac_hierarchy_bucket[ifsac_final_bucket_level]
                    if ifsac_final_bucket not in ifsac_final_buckets:
                        ifsac_final_buckets.append(ifsac_final_bucket)

                        # ``ifsac_final_bucket`` is a one-item dictionary
                        # of the form ``{bucket_id: bucket_label}``.
                        ifsac_final_bucket_id = list(
                            ifsac_final_bucket.keys())[0]

                        ifsac_final_label = \
                            classification_lookup_table["ifsac_labels"][ifsac_final_bucket_id]
                        ifsac_final_labels.append(ifsac_final_label)

        if not ifsac_final_labels or set(ifsac_final_labels) == {"food"}:
            # Attempt to find a classification using ifsac_default
            default_classification = ""
            sample_tokens = word_tokenize(sample)
            sample_tokens = [singularize(token) for token in sample_tokens]
            for bucket, label in classification_lookup_table[
                    "ifsac_default"].items():
                bucket_tokens = word_tokenize(bucket)
                bucket_tokens = [singularize(token) for token in bucket_tokens]
                if not (set(bucket_tokens) - set(sample_tokens)):
                    default_classification = label

            if default_classification:
                ifsac_final_buckets.append("Default classification")
                ifsac_final_labels.append(default_classification)

        ifsac_final_labels = \
            refine_ifsac_final_labels(sample, ifsac_final_labels,
                                      classification_lookup_table["ifsac_refinement"])

    return {
        "lexmapr_hierarchy_buckets": lexmapr_hierarchy_buckets,
        "lexmapr_final_buckets": lexmapr_final_buckets,
        "ifsac_final_buckets": ifsac_final_buckets,
        "ifsac_final_labels": ifsac_final_labels
    }
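The dictionary returned above always has the same four keys; only their contents vary. A hypothetical result for a sample that resolved to a single IFSAC bucket might look like the following (bucket IDs and labels are invented for illustration and do not come from a real classification table):

# Hypothetical return value of classify_sample; ids/labels illustrative only.
{
    "lexmapr_hierarchy_buckets": [{2: {"lexmapr_bucket_07": "poultry"}}],
    "lexmapr_final_buckets": [{"lexmapr_bucket_07": "poultry"}],
    "ifsac_final_buckets": [{"ifsac_bucket_03": "chicken"}],
    "ifsac_final_labels": ["chicken"],
}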
Example #4
def run(args):
    """
    Main text mining pipeline.
    """
    # If the user specified a profile, we must retrieve args specified
    # by the profile, unless they were explicitly overridden.
    if args.profile:
        args = pipeline_resources.get_profile_args(args)

    # To contain all resources, and their variations, that samples are
    # matched to.  Start by adding pre-defined resources from
    # lexmapr.predefined_resources.
    # TODO: These pre-defined resources are the remnants of early
    #  LexMapr development.  We should eventually move to only adding
    #  terms from online ontologies to lookup tables.
    lookup_table = pipeline_resources.get_predefined_resources()

    # To contain resources fetched from online ontologies, if any.
    # Will eventually be added to ``lookup_table``.
    ontology_lookup_table = None

    if args.config:
        # Fetch online ontology terms specified in config file.
        ontology_lookup_table = pipeline_resources.get_config_resources(
            args.config, args.no_cache)
    elif args.profile:
        # Fetch online ontology terms specified in profile.
        ontology_lookup_table = pipeline_resources.get_profile_resources(
            args.profile)

    if ontology_lookup_table:
        # Merge ``ontology_lookup_table`` into ``lookup_table``
        lookup_table = helpers.merge_lookup_tables(lookup_table,
                                                   ontology_lookup_table)

    # To contain resources used in classification.
    classification_lookup_table = None
    if args.bucket:
        classification_lookup_table = \
            pipeline_resources.get_classification_resources()

    # Output file Column Headings
    output_fields = [
        "Sample_Id", "Sample_Desc", "Cleaned_Sample", "Matched_Components"
    ]

    if args.full:
        output_fields += [
            "Match_Status(Macro Level)", "Match_Status(Micro Level)"
        ]

    if args.bucket:
        if args.full:
            output_fields += [
                "LexMapr Classification (Full List)", "LexMapr Bucket",
                "Third Party Bucket", "Third Party Classification"
            ]
        else:
            output_fields += ["Third Party Classification"]

    fw = open(args.output, 'w') if args.output else sys.stdout  # Main output file
    fw.write('\t'.join(output_fields))

    # Input file
    fr = open(args.input_file, "r")
    _, ext = os.path.splitext(args.input_file)
    if ext == ".csv":
        fr_reader = csv.reader(fr, delimiter=",")
    elif ext == ".tsv":
        fr_reader = csv.reader(fr, delimiter="\t")
    else:
        raise ValueError("Input file must be a .csv or .tsv file")
    # Skip header
    next(fr_reader)

    # Iterate over samples for matching to ontology terms
    for row in fr_reader:
        sample_id = row[0].strip()
        original_sample = " ".join(row[1:]).strip()
        cleaned_sample = ""
        matched_components = []
        macro_status = "No Match"
        micro_status = []
        lexmapr_classification = []
        lexmapr_bucket = []
        third_party_bucket = []
        third_party_classification = []

        # Standardize sample: lowercase it and apply punctuation
        # treatment.
        sample = original_sample.lower()
        sample = helpers.punctuation_treatment(sample)

        sample_tokens = word_tokenize(sample)

        # Get ``cleaned_sample``
        for tkn in sample_tokens:
            # Ignore dates and numbers
            if helpers.is_date(tkn) or helpers.is_number(tkn):
                continue
            # Some preprocessing
            tkn = helpers.preprocess(tkn)

            lemma = helpers.singularize_token(tkn, lookup_table, micro_status)
            lemma = helpers.spelling_correction(lemma, lookup_table,
                                                micro_status)
            lemma = helpers.abbreviation_normalization_token(
                lemma, lookup_table, micro_status)
            lemma = helpers.non_English_normalization_token(
                lemma, lookup_table, micro_status)

            cleaned_sample = helpers.get_cleaned_sample(
                cleaned_sample, lemma, lookup_table)
            cleaned_sample = re.sub(' +', ' ', cleaned_sample)
            cleaned_sample = helpers.abbreviation_normalization_phrase(
                cleaned_sample, lookup_table, micro_status)
            cleaned_sample = helpers.non_English_normalization_phrase(
                cleaned_sample, lookup_table, micro_status)

        cleaned_sample = helpers.remove_duplicate_tokens(cleaned_sample)

        # Attempt full term match
        full_term_match = helpers.map_term(sample, lookup_table)

        if not full_term_match:
            # Attempt full term match with cleaned sample
            full_term_match = helpers.map_term(cleaned_sample, lookup_table)
            if full_term_match:
                micro_status.insert(0, "Used Cleaned Sample")

        if not full_term_match:
            # Attempt full term match using suffixes
            full_term_match = helpers.map_term(sample,
                                               lookup_table,
                                               consider_suffixes=True)

        if not full_term_match:
            # Attempt full term match with cleaned sample using suffixes
            full_term_match =\
                helpers.map_term(cleaned_sample, lookup_table, consider_suffixes=True)
            if full_term_match:
                micro_status.insert(0, "Used Cleaned Sample")

        if full_term_match:
            matched_components.append(full_term_match["term"] + ":" +
                                      full_term_match["id"])
            macro_status = "Full Term Match"
            micro_status += full_term_match["status"]

            if args.bucket:
                classification_result = classify_sample(
                    sample, matched_components, lookup_table,
                    classification_lookup_table)
                lexmapr_classification = classification_result[
                    "lexmapr_hierarchy_buckets"]
                lexmapr_bucket = classification_result["lexmapr_final_buckets"]
                third_party_bucket = classification_result[
                    "ifsac_final_buckets"]
                third_party_classification = classification_result[
                    "ifsac_final_labels"]
        else:
            # Attempt various component matches
            component_matches = []
            covered_tokens = set()

            for i in range(5, 0, -1):
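                # n-gram chunks are tried from length 5 down to 1, so longer
                # (more specific) matches are found first and mark their
                # tokens as covered.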
                for gram_chunk in helpers.get_gram_chunks(cleaned_sample, i):
                    concat_gram_chunk = " ".join(gram_chunk)
                    gram_tokens = word_tokenize(concat_gram_chunk)
                    gram_permutations =\
                        list(OrderedDict.fromkeys(permutations(concat_gram_chunk.split())))

                    # gram_tokens covered in prior component match
                    if set(gram_tokens) <= covered_tokens:
                        continue

                    for gram_permutation in gram_permutations:
                        gram_permutation_str = " ".join(gram_permutation)
                        component_match = helpers.map_term(
                            gram_permutation_str, lookup_table)

                        if not component_match:
                            # Try again with suffixes
                            component_match = helpers.map_term(
                                gram_permutation_str,
                                lookup_table,
                                consider_suffixes=True)

                        if component_match:
                            component_matches.append(component_match)
                            covered_tokens.update(gram_tokens)
                            break

            # We should not consider component matches that are
            # ancestral to other component matches.
            ancestors = set()
            for component_match in component_matches:
                component_match_hierarchies =\
                    helpers.get_term_parent_hierarchies(component_match["id"], lookup_table)

                for component_match_hierarchy in component_match_hierarchies:
                    # Drop the first element (the matched term itself);
                    # only its ancestors matter here.
                    component_match_hierarchy.pop(0)

                    ancestors |= set(component_match_hierarchy)

            for component_match in component_matches:
                if component_match["id"] not in ancestors:
                    matched_component = component_match[
                        "term"] + ":" + component_match["id"]
                    matched_components.append(matched_component)

            # TODO: revisit this step. We do need it, but perhaps the
            # function could be simplified?
            if matched_components:
                matched_components = helpers.retainedPhrase(matched_components)

            # Finalize micro_status
            # TODO: This is ugly, so revisit after revisiting
            #  ``retainedPhrase``.
            micro_status_covered_matches = set()
            for component_match in component_matches:
                possible_matched_component = component_match[
                    "term"] + ":" + component_match["id"]
                if possible_matched_component in matched_components:
                    if possible_matched_component not in micro_status_covered_matches:
                        micro_status_covered_matches.add(
                            possible_matched_component)
                        micro_status.append("{%s: %s}" %
                                            (component_match["term"],
                                             component_match["status"]))

            if matched_components:
                macro_status = "Component Match"

            if args.bucket:
                classification_result = classify_sample(
                    sample, matched_components, lookup_table,
                    classification_lookup_table)
                lexmapr_classification = classification_result[
                    "lexmapr_hierarchy_buckets"]
                lexmapr_bucket = classification_result["lexmapr_final_buckets"]
                third_party_bucket = classification_result[
                    "ifsac_final_buckets"]
                third_party_classification = classification_result[
                    "ifsac_final_labels"]

        # Write to row
        fw.write("\n" + sample_id + "\t" + original_sample + "\t" +
                 cleaned_sample + "\t" + str(matched_components))

        if args.full:
            fw.write("\t" + macro_status + "\t" + str(micro_status))

        if args.bucket:
            if args.full:
                fw.write("\t" + str(lexmapr_classification) + "\t" +
                         str(lexmapr_bucket) + "\t" + str(third_party_bucket))
            fw.write("\t" + str(sorted(third_party_classification)))

    fw.write('\n')
    # Close the output file unless we wrote to stdout
    if fw is not sys.stdout:
        fw.close()
    # Close the input file
    fr.close()
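A minimal invocation sketch follows. The attribute names are exactly the ones ``run`` reads (args.profile, args.config, args.no_cache, args.bucket, args.full, args.input_file, args.output); the real LexMapr CLI builds this namespace with its own argument parser, and the file names here are hypothetical.

from argparse import Namespace

args = Namespace(
    profile=None,              # or the name of a predefined profile
    config=None,               # or a path to an ontology config file
    no_cache=False,            # only consulted when config is set
    bucket=True,               # add LexMapr/IFSAC classification columns
    full=False,                # add macro/micro match-status columns
    input_file="samples.csv",  # hypothetical input path (.csv or .tsv)
    output="matches.tsv",      # None writes to stdout instead
)
run(args)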