Example 1
def add_predefined_resources_to_lookup_table(lookup_table):
    """Add elements from lexmapr.predefined_resources to lookup table.

    :param lookup_table: See create_lookup_table_skeleton for the
                         expected format of this parameter
    :type lookup_table: dict
    :return: Modified ``lookup_table``
    :rtype: dict
    """
    # Abbreviations of resource terms
    lookup_table["abbreviations"] = get_resource_dict("AbbLex.csv")
    # Non-English translations of resource terms
    lookup_table["non_english_words"] = get_resource_dict("NefLex.csv")
    # Common misspellings of resource terms
    lookup_table["spelling_mistakes"] = get_resource_dict("ScorLex.csv")
    # Terms excluded from inflection treatment
    lookup_table["inflection_exceptions"] = get_resource_dict(
        "inflection-exceptions.csv")
    # Constrained list of stop words considered to be meaningless
    lookup_table["stop_words"] = get_resource_dict("mining-stopwords.csv")
    # Suffixes to consider appending to terms when mining ontologies
    lookup_table["suffixes"] = get_resource_dict("suffixes.csv")

    lookup_table["synonyms"] = get_resource_dict("SynLex.csv")
    lookup_table["synonyms"] = {
        punctuation_treatment(k): punctuation_treatment(v)
        for k, v in lookup_table["synonyms"].items()
    }

    lookup_table["non_standard_resource_ids"] = get_resource_dict(
        "CombinedResourceTerms.csv")

    lookup_table["standard_resource_labels"] = {
        punctuation_treatment(v): k
        for k, v in lookup_table["non_standard_resource_ids"].items()
    }

    for label in lookup_table["standard_resource_labels"]:
        resource_id = lookup_table["standard_resource_labels"][label]
        label_tokens = word_tokenize(label)
        # To limit performance overhead, we ignore resource labels with
        # seven or more tokens, as permutating that many tokens can be
        # costly. We also ignore NCBI taxon terms, as there are
        # ~160000 such terms.
        if len(label_tokens) < 7 and "ncbitaxon" not in resource_id:
            label_permutations = get_resource_label_permutations(label)
            for permutation in label_permutations:
                lookup_table["standard_resource_label_permutations"][
                    permutation] = resource_id
    return lookup_table
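To make the permutation indexing above concrete, here is a minimal, self-contained sketch. The get_resource_label_permutations defined here is a hypothetical stand-in (the real helper is not shown in this example and tokenizes with word_tokenize rather than split()), and the resource ID is a placeholder.

from itertools import permutations


def get_resource_label_permutations(label):
    # Hypothetical stand-in: every ordering of the label's tokens,
    # joined back into space-separated strings.
    tokens = label.split()
    return [" ".join(p) for p in permutations(tokens)]


lookup_table = {"standard_resource_label_permutations": {}}
resource_id = "ex_0000001"  # placeholder ID, not a real ontology term
for permutation in get_resource_label_permutations("roasted chicken breast"):
    lookup_table["standard_resource_label_permutations"][permutation] = resource_id

print(len(lookup_table["standard_resource_label_permutations"]))  # 6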
Example 2
def test_punctuationTreatment(self):
    """Tests punctuation_treatment."""
    # Empty input string
    self.assertEqual(pipeline_helpers.punctuation_treatment(""), "")
    # Single-token input string with no punctuation
    self.assertEqual(pipeline_helpers.punctuation_treatment("foo"), "foo")
    # Multi-token input string with no punctuation
    self.assertEqual(pipeline_helpers.punctuation_treatment("foo bar"), "foo bar")
    # Single-token input string with punctuation
    self.assertEqual(pipeline_helpers.punctuation_treatment("_foo-bar_"), "foo bar")
    # Multi-token input string with punctuation
    self.assertEqual(pipeline_helpers.punctuation_treatment("_foo;ba r_"), "foo ba r")
    # Multi-token input string with number and punctuation
    self.assertEqual(pipeline_helpers.punctuation_treatment("a-b -1"), "a b 1")
Example 3
def add_fetched_ontology_to_lookup_table(lookup_table, fetched_ontology):
    """Add terms from fetched_ontology to lookup_table.

    lookup_table can be used to map terms in run. See
    create_lookup_table_skeleton for the expected format of
    lookup_table.

    :param lookup_table: See create_lookup_table_skeleton for the
                         expected format of this parameter
    :param fetched_ontology: See JSON output of ontofetch.py for the
                             expected format of this parameter
    :type lookup_table: dict
    :type fetched_ontology: dict
    :return: Modified ``lookup_table``
    :rtype: dict
    """
    # Parse content from fetched_ontology and add it to lookup_table
    for resource in fetched_ontology["specifications"].values():
        if "id" in resource and "label" in resource:
            resource_id = resource["id"].lower()
            resource_label = resource["label"].lower()

            # ID value should match format of pre-defined resources
            resource_id = resource_id.replace(":", "_")
            lookup_table["non_standard_resource_ids"][
                resource_id] = resource_label

            # Standardize label
            resource_label = punctuation_treatment(resource_label)
            lookup_table["standard_resource_labels"][
                resource_label] = resource_id

            # List of tokens in resource_label
            resource_tokens = word_tokenize(resource_label)
            # Add permutations if there are fewer than seven tokens.
            # Permutating more tokens than this can lead to performance
            # issues.
            if len(resource_tokens) < 7:
                permutations = get_resource_label_permutations(resource_label)
                for permutation in permutations:
                    lookup_table["standard_resource_label_permutations"][
                        permutation] = resource_id

            if "oboInOwl:hasSynonym" in resource:
                synonyms = resource["oboInOwl:hasSynonym"]
                for synonym in synonyms:
                    # Standardize synonym
                    synonym = punctuation_treatment(synonym.lower())

                    lookup_table["synonyms"][synonym] = resource_label

            if "oboInOwl:hasNarrowSynonym" in resource:
                synonyms = resource["oboInOwl:hasNarrowSynonym"]
                for synonym in synonyms:
                    # Standardize synonym
                    synonym = punctuation_treatment(synonym.lower())

                    lookup_table["synonyms"][synonym] = resource_label

            if "oboInOwl:hasExactSynonym" in resource:
                synonyms = resource["oboInOwl:hasExactSynonym"]
                for synonym in synonyms:
                    # Standardize synonym
                    synonym = punctuation_treatment(synonym.lower())

                    lookup_table["synonyms"][synonym] = resource_label

            if "parent_id" in resource:
                # Standardize parent_id
                parent_id = resource["parent_id"].replace(":", "_")
                parent_id = parent_id.lower()

                # Bug in ``ontofetch.py``--sometimes a resource is
                # parent to itself. Remove when fixed.
                if resource_id == parent_id:
                    continue
                # Instead of overwriting parents like we do with
                # synonyms, we will concatenate parents from different
                # fetches.
                elif resource_id in lookup_table["parents"]:
                    # Prevent duplicates
                    if parent_id not in lookup_table["parents"][resource_id]:
                        lookup_table["parents"][resource_id] += [parent_id]
                else:
                    lookup_table["parents"][resource_id] = [parent_id]

                if "other_parents" in resource:
                    # Standardize values
                    other_parents = list(
                        map(lambda x: x.replace(":", "_").lower(),
                            resource["other_parents"]))

                    # Prevent duplicates
                    other_parents = list(
                        filter(
                            lambda x: x not in lookup_table["parents"][
                                resource_id], other_parents))

                    # Bug in ``ontofetch.py``--sometimes a resource is
                    # parent to itself. Remove when fixed.
                    other_parents = list(
                        filter(lambda x: x != resource_id, other_parents))

                    lookup_table["parents"][resource_id] += other_parents

    return lookup_table
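The expected shape of fetched_ontology is only described above by reference to ontofetch.py's JSON output. The snippet below is an illustrative guess at that shape, inferred solely from the keys the function reads; IDs, labels, and synonyms are placeholders.

# Illustrative input only; real ontofetch.py output may carry
# additional fields.
fetched_ontology = {
    "specifications": {
        "term_1": {
            "id": "EX:0000001",
            "label": "Chicken Breast",
            "oboInOwl:hasExactSynonym": ["breast of chicken"],
            "parent_id": "EX:0000002",
            "other_parents": ["EX:0000003"],
        }
    }
}
# Passing this through add_fetched_ontology_to_lookup_table would file
# the term under "ex_0000001" in non_standard_resource_ids, register
# "chicken breast" as a standard label (and its permutations), record
# the synonym, and append both parents under "parents".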
Example 4
def find_match(map_result, ontology_lookup_table):
    """Map input terms in ``map_result`` to ontology terms.

    Attempts a full-term match first, then falls back to component
    (n-gram) matches, using ``ontology_lookup_table`` for lookups.

    :return: Modified ``map_result``
    :rtype: dict
    """
    mapping_output = map_result["mapping_output"]
    input_to_ontology_mapping = map_result["input_to_ontology_mapping"]
    ontology_to_input_mapping = map_result["ontology_to_input_mapping"]
    input_term_label_map = map_result["input_term_label"]
    ontology_term_label_map = map_result["ontology_term_label"]

    for input_term, mapping_object in mapping_output.items():

        input_term_id = get_term_id(input_term)
        input_term_label = get_term_label(input_term)

        # Standardize the sample: lowercase it and apply punctuation
        # treatment.
        sample = input_term_label.lower()
        sample = helpers.punctuation_treatment(sample)

        # Tokenize sample and remove stop words and 1-letter words
        sample_tokens = word_tokenize(sample)

        # Get "cleaned_sample"
        cleaned_sample = ""
        for token in sample_tokens:
            # Ignore dates
            if helpers.is_date(token) or helpers.is_number(token):
                continue
            # Ignore single letter
            if helpers.is_single_letter(token):
                continue

            # Some preprocessing
            token = helpers.preprocess(token)

            lemma = helpers.singularize_token(
                token, ontology_lookup_table, [])
            lemma = helpers.spelling_correction(
                lemma, ontology_lookup_table, [])
            lemma = helpers.abbreviation_normalization_token(
                lemma, ontology_lookup_table, [])
            lemma = helpers.non_English_normalization_token(
                lemma, ontology_lookup_table, [])

            cleaned_sample = helpers.get_cleaned_sample(
                cleaned_sample, lemma, ontology_lookup_table)
            cleaned_sample = re.sub(' +', ' ', cleaned_sample)
            cleaned_sample = helpers.abbreviation_normalization_phrase(
                cleaned_sample, ontology_lookup_table, [])
            cleaned_sample = helpers.non_English_normalization_phrase(
                cleaned_sample, ontology_lookup_table, [])

        cleaned_sample = helpers.remove_duplicate_tokens(cleaned_sample)

        # Attempt full term match
        full_term_match = helpers.map_term(sample, ontology_lookup_table)

        if not full_term_match:
            # Attempt full term match with cleaned sample
            full_term_match =\
                helpers.map_term(cleaned_sample,
                                 ontology_lookup_table)
        if not full_term_match:
            # Attempt full term match using suffixes
            full_term_match =\
                helpers.map_term(sample,
                                 ontology_lookup_table,
                                 consider_suffixes=True)
        if not full_term_match:
            # Attempt full term match with cleaned sample using suffixes
            full_term_match =\
                helpers.map_term(cleaned_sample,
                                 ontology_lookup_table,
                                 consider_suffixes=True)

        if full_term_match:
            ontology_term_id = full_term_match["id"].upper()
            ontology_term_label = full_term_match["term"]
            ontology_term = "%s:%s" % (ontology_term_label, ontology_term_id)
            mapping_output[input_term] = ontology_term
            input_to_ontology_mapping[input_term_id] = ontology_term_id
            ontology_to_input_mapping[ontology_term_id] = input_term_id
            ontology_term_label_map[ontology_term_id] = full_term_match["term"]
            input_term_label_map[input_term_id] = input_term_label
        else:
            # Attempt various component matches
            component_matches = []
            covered_tokens = set()

            for i in range(5, 0, -1):
                for gram_chunk in helpers.get_gram_chunks(cleaned_sample, i):
                    concat_gram_chunk = " ".join(gram_chunk)
                    gram_tokens = word_tokenize(concat_gram_chunk)
                    gram_permutations =\
                        list(OrderedDict.fromkeys(permutations(
                            concat_gram_chunk.split())))

                    # gram_tokens covered in prior component match
                    if set(gram_tokens) <= covered_tokens:
                        continue

                    for gram_permutation in gram_permutations:
                        gram_permutation_str = " ".join(gram_permutation)
                        component_match =\
                            helpers.map_term(gram_permutation_str,
                                             ontology_lookup_table)
                        if not component_match:
                            # Try again with suffixes
                            component_match =\
                                helpers.map_term(gram_permutation_str,
                                                 ontology_lookup_table,
                                                 consider_suffixes=True)
                        if component_match:
                            component_matches.append(component_match)
                            covered_tokens.update(gram_tokens)
                            break

            # We should not consider component matches that are
            # ancestral to other component matches.
            ancestors = set()
            for component_match in component_matches:
                component_match_hierarchies =\
                    helpers.get_term_parent_hierarchies(component_match["id"],
                                                        ontology_lookup_table)

                for component_match_hierarchy in component_match_hierarchies:
                    # The first element is the term itself, so drop it
                    component_match_hierarchy.pop(0)

                    ancestors |= set(component_match_hierarchy)

            matched_components = []
            for component_match in component_matches:
                if component_match["id"] not in ancestors:
                    matched_component =\
                        "%s:%s" % (component_match["term"],
                                   component_match["id"])
                    matched_components.append(matched_component)

            # TODO: revisit this step.
            # We do need it, but perhaps the function could be
            #  simplified?
            if len(matched_components):
                matched_components = helpers.retain_phrase(matched_components)

            if matched_components:
                if len(matched_components) == 1:
                    single_value = matched_components[0]
                    onto_term_id = get_term_id(single_value)
                    onto_term_label = get_term_label(single_value)
                    mapping_output[input_term] = single_value
                    input_to_ontology_mapping[input_term_id] = onto_term_id
                    if onto_term_id not in ontology_to_input_mapping:
                        ontology_to_input_mapping[onto_term_id] = input_term_id
                    ontology_term_label_map[onto_term_id] = onto_term_label
                    input_term_label_map[input_term_id] = input_term_label
                else:
                    mapping_output[input_term] = matched_components
                    input_to_ontology_mapping[input_term_id] =\
                        [onto_term_id for onto_term_id in map(
                            lambda s: get_term_id(s), matched_components)]
                    input_term_label_map[input_term_id] = input_term_label
                    for ontology_term in matched_components:
                        onto_term_id = get_term_id(ontology_term)
                        onto_term_label = get_term_label(ontology_term)
                        ontology_term_label_map[onto_term_id] = onto_term_label

    return map_result
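The component-match fallback above chunks the cleaned sample into n-grams (n from 5 down to 1) and tries every ordering of each chunk against the lookup table. A small, self-contained illustration of that chunk-and-permute loop follows; get_gram_chunks here is a hypothetical stand-in for helpers.get_gram_chunks, and the sample text is invented.

from collections import OrderedDict
from itertools import permutations


def get_gram_chunks(text, n):
    # Hypothetical stand-in: every contiguous n-token window of the text.
    tokens = text.split()
    return [tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]


cleaned_sample = "organic chicken breast"
for i in range(3, 0, -1):
    for gram_chunk in get_gram_chunks(cleaned_sample, i):
        concat_gram_chunk = " ".join(gram_chunk)
        # De-duplicate token orderings while preserving order, as in
        # find_match above.
        gram_permutations = list(
            OrderedDict.fromkeys(permutations(concat_gram_chunk.split())))
        print(i, [" ".join(p) for p in gram_permutations])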
Example 5
def run(args):
    """
    Main text mining pipeline.
    """
    # If the user specified a profile, we must retrieve args specified
    # by the profile, unless they were explicitly overridden.
    if args.profile:
        args = pipeline_resources.get_profile_args(args)

    # To contain all resources, and their variations, that samples are
    # matched to.  Start by adding pre-defined resources from
    # lexmapr.predefined_resources.
    # TODO: These pre-defined resources are the remnants of early
    #  LexMapr development.  We should eventually move to only adding
    #  terms from online ontologies to lookup tables.
    lookup_table = pipeline_resources.get_predefined_resources()

    # To contain resources fetched from online ontologies, if any.
    # Will eventually be added to ``lookup_table``.
    ontology_lookup_table = None

    if args.config:
        # Fetch online ontology terms specified in config file.
        ontology_lookup_table = pipeline_resources.get_config_resources(
            args.config, args.no_cache)
    elif args.profile:
        # Fetch online ontology terms specified in profile.
        ontology_lookup_table = pipeline_resources.get_profile_resources(
            args.profile)

    if ontology_lookup_table:
        # Merge ``ontology_lookup_table`` into ``lookup_table``
        lookup_table = helpers.merge_lookup_tables(lookup_table,
                                                   ontology_lookup_table)

    # To contain resources used in classification.
    classification_lookup_table = None
    if args.bucket:
        classification_lookup_table = \
            pipeline_resources.get_classification_resources()

    # Output file column headings
    output_fields = [
        "Sample_Id", "Sample_Desc", "Cleaned_Sample", "Matched_Components"
    ]

    if args.full:
        output_fields += [
            "Match_Status(Macro Level)", "Match_Status(Micro Level)"
        ]

    if args.bucket:
        if args.full:
            output_fields += [
                "LexMapr Classification (Full List)", "LexMapr Bucket",
                "Third Party Bucket", "Third Party Classification"
            ]
        else:
            output_fields += ["Third Party Classification"]

    fw = open(args.output,
              'w') if args.output else sys.stdout  # Main output file
    fw.write('\t'.join(output_fields))

    # Input file
    fr = open(args.input_file, "r")
    _, ext = os.path.splitext(args.input_file)
    if ext == ".csv":
        fr_reader = csv.reader(fr, delimiter=",")
    elif ext == ".tsv":
        fr_reader = csv.reader(fr, delimiter="\t")
    else:
        raise ValueError("Should not reach here")
    # Skip header
    next(fr_reader)

    # Iterate over samples for matching to ontology terms
    for row in fr_reader:
        sample_id = row[0].strip()
        original_sample = " ".join(row[1:]).strip()
        cleaned_sample = ""
        matched_components = []
        macro_status = "No Match"
        micro_status = []
        lexmapr_classification = []
        lexmapr_bucket = []
        third_party_bucket = []
        third_party_classification = []

        # Standardize the sample: lowercase it and apply punctuation
        # treatment.
        sample = original_sample.lower()
        sample = helpers.punctuation_treatment(sample)

        sample_tokens = word_tokenize(sample)

        # Get ``cleaned_sample``
        for tkn in sample_tokens:
            # Ignore dates
            if helpers.is_date(tkn) or helpers.is_number(tkn):
                continue
            # Some preprocessing
            tkn = helpers.preprocess(tkn)

            lemma = helpers.singularize_token(tkn, lookup_table, micro_status)
            lemma = helpers.spelling_correction(lemma, lookup_table,
                                                micro_status)
            lemma = helpers.abbreviation_normalization_token(
                lemma, lookup_table, micro_status)
            lemma = helpers.non_English_normalization_token(
                lemma, lookup_table, micro_status)

            cleaned_sample = helpers.get_cleaned_sample(
                cleaned_sample, lemma, lookup_table)
            cleaned_sample = re.sub(' +', ' ', cleaned_sample)
            cleaned_sample = helpers.abbreviation_normalization_phrase(
                cleaned_sample, lookup_table, micro_status)
            cleaned_sample = helpers.non_English_normalization_phrase(
                cleaned_sample, lookup_table, micro_status)

        cleaned_sample = helpers.remove_duplicate_tokens(cleaned_sample)

        # Attempt full term match
        full_term_match = helpers.map_term(sample, lookup_table)

        if not full_term_match:
            # Attempt full term match with cleaned sample
            full_term_match = helpers.map_term(cleaned_sample, lookup_table)
            if full_term_match:
                micro_status.insert(0, "Used Cleaned Sample")

        if not full_term_match:
            # Attempt full term match using suffixes
            full_term_match = helpers.map_term(sample,
                                               lookup_table,
                                               consider_suffixes=True)

        if not full_term_match:
            # Attempt full term match with cleaned sample using suffixes
            full_term_match =\
                helpers.map_term(cleaned_sample, lookup_table, consider_suffixes=True)
            if full_term_match:
                micro_status.insert(0, "Used Cleaned Sample")

        if full_term_match:
            matched_components.append(full_term_match["term"] + ":" +
                                      full_term_match["id"])
            macro_status = "Full Term Match"
            micro_status += full_term_match["status"]

            if args.bucket:
                classification_result = classify_sample(
                    sample, matched_components, lookup_table,
                    classification_lookup_table)
                lexmapr_classification = classification_result[
                    "lexmapr_hierarchy_buckets"]
                lexmapr_bucket = classification_result["lexmapr_final_buckets"]
                third_party_bucket = classification_result[
                    "ifsac_final_buckets"]
                third_party_classification = classification_result[
                    "ifsac_final_labels"]
        else:
            # Attempt various component matches
            component_matches = []
            covered_tokens = set()

            for i in range(5, 0, -1):
                for gram_chunk in helpers.get_gram_chunks(cleaned_sample, i):
                    concat_gram_chunk = " ".join(gram_chunk)
                    gram_tokens = word_tokenize(concat_gram_chunk)
                    gram_permutations =\
                        list(OrderedDict.fromkeys(permutations(concat_gram_chunk.split())))

                    # gram_tokens covered in prior component match
                    if set(gram_tokens) <= covered_tokens:
                        continue

                    for gram_permutation in gram_permutations:
                        gram_permutation_str = " ".join(gram_permutation)
                        component_match = helpers.map_term(
                            gram_permutation_str, lookup_table)

                        if not component_match:
                            # Try again with suffixes
                            component_match = helpers.map_term(
                                gram_permutation_str,
                                lookup_table,
                                consider_suffixes=True)

                        if component_match:
                            component_matches.append(component_match)
                            covered_tokens.update(gram_tokens)
                            break

            # We should not consider component matches that are
            # ancestral to other component matches.
            ancestors = set()
            for component_match in component_matches:
                component_match_hierarchies =\
                    helpers.get_term_parent_hierarchies(component_match["id"], lookup_table)

                for component_match_hierarchy in component_match_hierarchies:
                    # The first element is the term itself, so drop it
                    component_match_hierarchy.pop(0)

                    ancestors |= set(component_match_hierarchy)

            for component_match in component_matches:
                if component_match["id"] not in ancestors:
                    matched_component = component_match[
                        "term"] + ":" + component_match["id"]
                    matched_components.append(matched_component)

            # TODO: revisit this step.
            # We do need it, but perhaps the function could be
            #  simplified?
            if len(matched_components):
                matched_components = helpers.retainedPhrase(matched_components)

            # Finalize micro_status
            # TODO: This is ugly, so revisit after revisiting
            #  ``retainedPhrase``.
            micro_status_covered_matches = set()
            for component_match in component_matches:
                possible_matched_component = component_match[
                    "term"] + ":" + component_match["id"]
                if possible_matched_component in matched_components:
                    if possible_matched_component not in micro_status_covered_matches:
                        micro_status_covered_matches.add(
                            possible_matched_component)
                        micro_status.append("{%s: %s}" %
                                            (component_match["term"],
                                             component_match["status"]))

            if matched_components:
                macro_status = "Component Match"

            if args.bucket:
                classification_result = classify_sample(
                    sample, matched_components, lookup_table,
                    classification_lookup_table)
                lexmapr_classification = classification_result[
                    "lexmapr_hierarchy_buckets"]
                lexmapr_bucket = classification_result["lexmapr_final_buckets"]
                third_party_bucket = classification_result[
                    "ifsac_final_buckets"]
                third_party_classification = classification_result[
                    "ifsac_final_labels"]

        # Write to row
        fw.write("\n" + sample_id + "\t" + original_sample + "\t" +
                 cleaned_sample + "\t" + str(matched_components))

        if args.full:
            fw.write("\t" + macro_status + "\t" + str(micro_status))

        if args.bucket:
            if args.full:
                fw.write("\t" + str(lexmapr_classification) + "\t" +
                         str(lexmapr_bucket) + "\t" + str(third_party_bucket))
            fw.write("\t" + str(sorted(third_party_classification)))

    fw.write('\n')
    # Close the output file, unless writing to stdout
    if fw is not sys.stdout:
        fw.close()
    # Close the input file
    fr.close()
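A minimal sketch of run()'s I/O conventions: the delimiter is chosen from the input extension, the header row is skipped, and output rows are tab-separated. The file name is a placeholder and the input is an in-memory stand-in so the snippet runs on its own.

import csv
import io
import os
import sys

input_file = "samples.csv"  # hypothetical path; only the extension matters
_, ext = os.path.splitext(input_file)
if ext == ".csv":
    delimiter = ","
elif ext == ".tsv":
    delimiter = "\t"
else:
    raise ValueError("Input file must have a .csv or .tsv extension")

fr = io.StringIO("Sample_Id,Sample_Desc\nS1,Chicken breast; raw\n")
fr_reader = csv.reader(fr, delimiter=delimiter)
next(fr_reader)  # skip the header row

fw = sys.stdout  # run() falls back to stdout when --output is not given
fw.write("\t".join(["Sample_Id", "Sample_Desc"]))  # run() writes more columns
for row in fr_reader:
    sample_id = row[0].strip()
    original_sample = " ".join(row[1:]).strip()
    fw.write("\n" + sample_id + "\t" + original_sample)
fw.write("\n")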
Example 6
def refine_ifsac_final_labels(sample, ifsac_final_labels, label_refinements):
    """Gets refined final labels after application of customized rules.

    :param str sample: sample
    :param set ifsac_final_labels: the final labels set
    :param dict label_refinements: the dictionary of label refinement
        resource
    :return: refined final labels
    :rtype: list
    """

    # Caution: Rules are sequential - changing the order might change
    # results.
    ret = set(ifsac_final_labels)
    sample = helpers.punctuation_treatment(sample)
    sample_tokens = word_tokenize(sample)
    sample_tokens_set = set(sample_tokens)

    for label, refined_label in label_refinements.items():
        label_tokens = word_tokenize(label)
        if not (set(label_tokens) - set(sample_tokens)) or re.search(
                r"\b" + label + r"\b", sample):
            ret.add(refined_label)
            break

    # Defines different groups/categories of classes
    specific_meat_categories = {
        "pork", "chicken", "beef", "fish", "game", "poultry", "turkey"
    }
    mollusk_categories = {"mollusks (non-bi-valve)", "mollusks (bi-valve)"}
    shellfish_categories = {"crustaceans", "mollusks"} | mollusk_categories
    aquatic_animal_categories = {"fish", "other aquatic animals"
                                 } | shellfish_categories
    poultry_categories = {"other poultry", "chicken", "turkey"}
    avian_categories = {"other poultry", "game", "poultry"
                        } | poultry_categories
    animal_categories = {
        "human", "companion animal", "aquatic animals", "wild animal", "beef",
        "pork", "other meat", "cow", "pig"
    }
    animal_categories |= avian_categories | aquatic_animal_categories | {
        "other animal"
    }
    veterinary_categories = avian_categories | aquatic_animal_categories | {
        "other animal"
    }
    veterinary_categories |= {
        "animal", "avian", "companion animal", "aquatic animals",
        "wild animal", "beef", "pork", "other meat", "cow", "pig"
    }
    environmental_categories = {
        "environmental-water", "environmental-farm",
        "environmental-restaurant", "environmental-retail",
        "environmental-abattoir", "environmental-warehouse",
        "environmental-researchfacility", "environmental-pasture",
        "environmental-animal housing",
        "environmental-factory/production facility", "environmental-vehicle",
        "environmental-construction"
    }
    root_underground_categories = {
        "root/underground (roots)", "root/underground (tubers)",
        "root/underground (bulbs)", "root/underground (other)"
    }
    seeded_vegetable_categories = {
        "seeded vegetables (vine-grown)", "seeded vegetables (solanaceous)",
        "seeded vegetables (legumes)", "seeded vegetables (other)"
    }
    vegetable_categories = {
        "fungi", "sprouts", "root/underground", "seeded vegetables", "herbs",
        "vegetable row crops (flower)", "vegetable row crops (stem)",
        "vegetable row crops (leafy)"
    }
    vegetable_categories |= root_underground_categories | seeded_vegetable_categories
    fruit_categories = {
        "melon fruit", "pome fruit", "stone fruit", "sub-tropical fruit",
        "small fruit", "tropical fruit"
    }
    plant_categories = {
        "oils", "vegetables", "fruits", "grains", "beans", "nuts", "seeds"
    }
    plant_categories |= vegetable_categories | fruit_categories
    other_plant_food_category = {
        "other (food additive)", "dietary supplement", "other (sweetener)",
        "other (flavoring and seasoning", "other (confectionary)"
    }
    other_animal_food_category = {"meat", "other meat", "beef", "pork"}

    # Customized rules for refinement of class labels
    # Deals with "animal feed" class
    if "animal feed" in ret and "by" in sample and "by product" not in sample:
        ret.remove("animal feed")

    # Deals with "clinical/research" class
    if "clinical/research" in ret \
            and ret.intersection(plant_categories | other_plant_food_category) \
            and not ("swab" in sample or "clinical" in sample):
        ret.remove("clinical/research")
    if "clinical/research" in ret and "swab sub" in sample:
        ret.clear()
        ret.add("environmental")
    if "clinical/research" in ret and "scat" in sample:
        ret.remove("clinical/research")
        ret.add("environmental")
    if "clinical/research" in ret and "environmental" in ret \
            and not ("tissue" in sample or "biological" in sample):
        ret.remove("clinical/research")
    if "clinical/research" in ret and ret.intersection(
            environmental_categories):
        ret.remove("clinical/research")
    if "clinical/research" in ret and (ret.intersection(plant_categories)
                                       or ret.intersection(animal_categories)):
        if "shell" in sample or "shell on" in sample or "shellon" in sample:
            ret.remove("clinical/research")
    if "clinical/research" in ret and ret.intersection(veterinary_categories):
        ret.remove("clinical/research")
        ret.add("veterinary clinical/research")
    if "veterinary clinical/research" in ret and "animal" in ret:
        ret.remove("animal")

    # Converts an unspecified "animal" label to "other animal", unless
    # the sample is just the general animal class.
    if "animal" in ret and sample != "animal":
        ret.remove("animal")
        ret.add("other animal")

    # Deals with "dairy", "cow" and "beef" cases
    if "dairy" in ret and "cow" in ret:
        ret.remove("cow")
    if "beef" in ret and "dairy" in ret and "milk" in sample:
        ret.remove("beef")
    beef_keywords = [
        "raw cow", "raw veal", "raw calf", "meat", "beef", "cow lung",
        "cow liver", "cow heart"
    ]
    for entry in beef_keywords:
        if entry in sample and "cow" in ret:
            ret.remove("cow")
            ret.add("beef")
    pork_keywords = ["raw pig", "raw swine", "meat", "pork", "porcine"]
    for entry in pork_keywords:
        if entry in sample and "pig" in ret:
            ret.remove("pig")
            ret.add("pork")
    if "cow" in ret and "beef" in ret:
        ret.remove("cow")
    if "beef" in ret and "fish" in ret and ("fillet" in sample
                                            or "filet" in sample):
        ret.remove("beef")
    if "beef" in ret and ("veterinary clinical/research" in ret):
        ret.remove("beef")
        ret.add("cow")
    if "oils" in ret and "in oil" in sample:
        ret.remove("oils")
    if "other (sweetener)" in ret and "sugar free" in sample:
        ret.remove("other (sweetener)")

    # Deals with "fish", "shellfish" and "eggs" cases
    if "shellfish" in ret and "fish" in ret:
        ret.remove("fish")
    if "fish" in ret and "eggs" in ret:
        ret.remove("eggs")
    if "fish eggs" in ret and "eggs" in ret:
        ret.remove("fish eggs")
    if "fish" in ret and "poultry" in ret:
        ret.remove("poultry")
    if "fish" in ret and "other poultry" in ret:
        ret.remove("other poultry")
    if "poultry" in ret and "eggs" in ret:
        ret.remove("poultry")

    # Deals with "pig", "pork" and "meat" cases
    if ("pork" in ret or "pork" in sample) and ("pig" in ret):
        ret.remove("pig")
        ret.add("pork")
    if ("pork" in ret or "pork" in sample) and ("meat" in ret):
        ret.remove("meat")
        ret.add("pork")
    if "pork" in ret and "veterinary clinical/research" in ret:
        ret.remove("pork")
        ret.add("pig")
    if "meat" in ret and ("veterinary clinical/research" in ret
                          or "engineering  seafood" in ret):
        ret.remove("meat")
    if ret.intersection(specific_meat_categories) and "meat" in ret:
        ret.remove("meat")

    # Deals with cases where clinical/research labels and meat labels
    # co-occur.
    if not ret.intersection(animal_categories) and "other meat" in ret \
            and ("veterinary clinical/research" in ret or "clinical/research" in ret):
        ret.remove("other meat")
        ret.add("other animal")
    if not ret.intersection(animal_categories) and "meat" in ret \
            and ("veterinary clinical/research" in ret or "clinical/research" in ret):
        ret.remove("meat")
        if "liver" not in sample:
            ret.add("other animal")
    if not ret.intersection(animal_categories) and (
            "veterinary clinical/research" in ret):
        ret.add("other animal")

    # Retains the specific (more granular) animal classes
    if "mollusks" in ret and ret.intersection(mollusk_categories):
        ret.remove("mollusks")
    if "shellfish" in ret and ret.intersection(shellfish_categories):
        ret.remove("shellfish")
    if "aquatic animals" in ret and ret.intersection(
            aquatic_animal_categories):
        ret.remove("aquatic animals")
    if "poultry" in ret and ret.intersection(poultry_categories):
        ret.remove("poultry")
    if "other animal" in ret and ret.intersection(avian_categories):
        ret.remove("other animal")
    if "animal" in ret and ret.intersection(animal_categories):
        ret.remove("animal")
    if "engineered seafood" in ret and ret.intersection(
            aquatic_animal_categories):
        ret = ret - ret.intersection(aquatic_animal_categories)
    if "engineered seafood" in ret and "aquatic animals" in ret:
        ret.remove("aquatic animals")
    if ("engineered seafood" in ret
            or "companion animal" in ret) and "other animal" in ret:
        ret.remove("other animal")

    # Retains the specific (more granular) plant classes
    if "root/underground" in ret and ret.intersection(
            root_underground_categories):
        ret.remove("root/underground")
    if "seeded vegetables" in ret and ret.intersection(
            seeded_vegetable_categories):
        ret.remove("seeded vegetables")
    if "vegetables" in ret and ret.intersection(vegetable_categories):
        ret.remove("vegetables")
    if "fruits" in ret and ret.intersection(fruit_categories):
        ret.remove("fruits")
    if "plant" in ret and ret.intersection(plant_categories):
        ret.remove("plant")

    # Deals with "nut", and "seeds", and "environment-water" and "fish"
    # case.
    if "nut" in ret and "seeds" in ret and len(ret) == 2:
        ret.remove("seeds")
    if "environment-water" in ret and "fish" in ret and len(ret) == 2:
        ret.remove("environment-water")

    # Retains the specific (more granular) environmental classes
    if "environmental" in ret and ret.intersection(environmental_categories):
        ret.remove("environmental")
    if ("environmental-animal housing" in ret or "environmental-abattoir" in ret
            or "environmental-farm" in ret) \
            and "environmental-factory/production facility" in ret:
        ret.remove("environmental-factory/production facility")
    if "environmental-abattoir" in ret and "environmental-factory/production facility" in ret:
        ret.remove("environmental-factory/production facility")
    exclusions = {
        'clinical/research', 'veterinary clinical/research', 'animal feed',
        'human', 'environmental'
    }

    # Assigns multi-ingredient to the cases where multiple food
    # ingredients have been tagged.
    if not (ret.intersection(exclusions) or ret.intersection(environmental_categories)) \
            and len(ret) >= 3:
        ret.add(
            "multi-ingredient")  # To be revisited and revised as per evaluation

    # Deals with some specific cases
    if "other meat" in ret and "other animal" in ret:
        ret.remove("other animal")
    if "meat" in ret and ret.intersection(animal_categories):
        if len(ret) == 3 and "multi-ingredient" in ret:
            ret.remove("multi-ingredient")
            ret.remove("meat")
        else:
            ret.remove("meat")

    # Retains the specific (more granular) classes and removes the
    # general "food" class.
    if "food" in ret and ret.intersection(animal_categories | plant_categories
                                          | other_animal_food_category
                                          | other_plant_food_category
                                          | {"plant", "animal"}):
        ret.remove("food")
    if "food" in ret and ("dairy" in ret or "environmental" in ret
                          or "clinical/research" in ret
                          or "veterinary clinical/research" in ret):
        ret.remove("food")

    # Deals with additional/unique cases
    if "food" in ret and "environmental" in ret and "leaf" in sample:
        ret.remove("environmental")
    if "environmental-animal housing" in ret and "finished" in sample:
        ret.remove("environmental-animal housing")
    if ("chicken" in ret or "poultry" in ret or "other poultry" in ret or "cow" in ret) \
            and "environmental-factory/production facility" in ret:
        ret.remove("environmental-factory/production facility")
        ret.add("environmental-farm")
    if "eggs" in ret and "veterinary clinical/research" in ret:
        ret.remove("veterinary clinical/research")
    if "environmental" in ret \
            and ("multi-ingredient" in ret or ret.intersection(plant_categories)) \
            and not ("swab" in sample or "environmental" in sample):
        ret.remove("environmental")

    # Deals with anatomical parts that are food for specific animal
    # categories rather than clinical/research samples.
    food_anatomical_parts = {
        'heart', 'liver', 'lung', 'leg', 'shell-on', 'shell', 'soft shell',
        'tail', 'hlso', 'shellon', 'beef', 'pork', 'meat', 'porcine',
        'shell on'
    }
    body_part_for_food_animal_categories = \
        aquatic_animal_categories | shellfish_categories | poultry_categories | {"cow"}
    if "veterinary clinical/research" in ret \
            and ret.intersection(body_part_for_food_animal_categories) \
            and sample_tokens_set.intersection(food_anatomical_parts) and "swab" not in sample:
        ret.remove("veterinary clinical/research")

    # Deals with very specific disambiguation tokens
    disambiguation_words = {
        'ground', 'scraps', 'cut', 'smoke', 'moon', 'plain'
    }
    if "environmental" in ret \
            and (ret.intersection(animal_categories) or ret.intersection(plant_categories)
                 or "dairy" in ret) \
            and sample_tokens_set.intersection(disambiguation_words):
        ret.remove("environmental")

    # Retains the general class (only animal feed)
    if "animal feed" in ret:
        ret.clear()
        ret.add("animal feed")

    # Deals with multi-ingredient case
    if ("multi-ingredient" in ret
            or "food supplement" in ret) and "food" in ret:
        ret.remove("food")
    if "food" in ret and len(ret) < 2:
        ret.remove("food")
        ret.add("multi-ingredient")

    return list(ret)
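refine_ifsac_final_labels is essentially a long, order-sensitive chain of set-rewriting rules. The toy function below isolates that pattern with two rules simplified from the dairy/cow and beef/dairy rules above; the sample string and label set are invented.

def apply_refinement_rules(sample, labels):
    # Toy illustration of the sequential rule pattern; not the real
    # IFSAC refinement logic.
    ret = set(labels)
    if "dairy" in ret and "cow" in ret:
        ret.remove("cow")   # keep the more specific food label
    if "beef" in ret and "dairy" in ret and "milk" in sample:
        ret.remove("beef")  # a milk sample is dairy, not beef
    return list(ret)


print(apply_refinement_rules("raw cow milk", {"dairy", "cow", "beef"}))
# ['dairy']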