def add_predefined_resources_to_lookup_table(lookup_table):
    """Add elements from lexmapr.predefined_resources to lookup table.

    :param lookup_table: See create_lookup_table_skeleton for the
        expected format of this parameter
    :type lookup_table: dict
    :return: Modified ``lookup_table``
    :rtype: dict
    """
    # Abbreviations of resource terms
    lookup_table["abbreviations"] = get_resource_dict("AbbLex.csv")
    # Non-English translations of resource terms
    lookup_table["non_english_words"] = get_resource_dict("NefLex.csv")
    # Common misspellings of resource terms
    lookup_table["spelling_mistakes"] = get_resource_dict("ScorLex.csv")
    # Terms excluded from inflection treatment
    lookup_table["inflection_exceptions"] = get_resource_dict(
        "inflection-exceptions.csv")
    # Constrained list of stop words considered to be meaningless
    lookup_table["stop_words"] = get_resource_dict("mining-stopwords.csv")
    # Suffixes to consider appending to terms when mining ontologies
    lookup_table["suffixes"] = get_resource_dict("suffixes.csv")

    lookup_table["synonyms"] = get_resource_dict("SynLex.csv")
    lookup_table["synonyms"] = {
        punctuation_treatment(k): punctuation_treatment(v)
        for k, v in lookup_table["synonyms"].items()
    }

    lookup_table["non_standard_resource_ids"] = get_resource_dict(
        "CombinedResourceTerms.csv")

    lookup_table["standard_resource_labels"] = {
        punctuation_treatment(v): k
        for k, v in lookup_table["non_standard_resource_ids"].items()
    }

    for label in lookup_table["standard_resource_labels"]:
        resource_id = lookup_table["standard_resource_labels"][label]
        label_tokens = word_tokenize(label)
        # To limit performance overhead, we ignore resource labels with
        # seven or more tokens, as permuting too many tokens can be
        # costly. We also ignore NCBI taxon terms, as there are
        # ~160000 such terms.
        if len(label_tokens) < 7 and "ncbitaxon" not in resource_id:
            label_permutations = get_resource_label_permutations(label)
            for permutation in label_permutations:
                lookup_table["standard_resource_label_permutations"][
                    permutation] = resource_id

    return lookup_table
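
# A usage sketch follows. It is illustrative only: the ``_example`` helper is
# hypothetical, and it assumes ``create_lookup_table_skeleton`` (referenced in
# the docstring above) initializes every key this function populates,
# including an empty "standard_resource_label_permutations" dict.
def _example_build_predefined_lookup_table():
    """Build a lookup table from the pre-defined resources (sketch)."""
    lookup_table = create_lookup_table_skeleton()
    lookup_table = add_predefined_resources_to_lookup_table(lookup_table)
    # Standardized labels now map back to resource IDs.
    assert "standard_resource_labels" in lookup_table
    return lookup_table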
def test_punctuationTreatment(self):
    """Tests punctuation_treatment."""
    # Empty input string
    self.assertEqual(pipeline_helpers.punctuation_treatment(""), "")
    # Single-token input string with no punctuation
    self.assertEqual(pipeline_helpers.punctuation_treatment("foo"), "foo")
    # Multi-token input string with no punctuation
    self.assertEqual(pipeline_helpers.punctuation_treatment("foo bar"),
                     "foo bar")
    # Single-token input string with punctuation
    self.assertEqual(pipeline_helpers.punctuation_treatment("_foo-bar_"),
                     "foo bar")
    # Multi-token input string with punctuation
    self.assertEqual(pipeline_helpers.punctuation_treatment("_foo;ba r_"),
                     "foo ba r")
    # Multi-token input string with number and punctuation
    self.assertEqual(pipeline_helpers.punctuation_treatment("a-b -1"),
                     "a b 1")
def add_fetched_ontology_to_lookup_table(lookup_table, fetched_ontology):
    """Add terms from fetched_ontology to lookup_table.

    lookup_table can be used to map terms in run. See
    create_lookup_table_skeleton for the expected format of
    lookup_table.

    :param lookup_table: See create_lookup_table_skeleton for the
        expected format of this parameter
    :param fetched_ontology: See JSON output of ontofetch.py for the
        expected format of this parameter
    :type lookup_table: dict
    :type fetched_ontology: dict
    :return: Modified ``lookup_table``
    :rtype: dict
    """
    # Parse content from fetched_ontology and add it to lookup_table
    for resource in fetched_ontology["specifications"].values():
        if "id" in resource and "label" in resource:
            resource_id = resource["id"].lower()
            resource_label = resource["label"].lower()

            # ID value should match format of pre-defined resources
            resource_id = resource_id.replace(":", "_")

            lookup_table["non_standard_resource_ids"][
                resource_id] = resource_label

            # Standardize label
            resource_label = punctuation_treatment(resource_label)

            lookup_table["standard_resource_labels"][
                resource_label] = resource_id

            # List of tokens in resource_label
            resource_tokens = word_tokenize(resource_label)
            # Add permutations if there are fewer than seven tokens.
            # Permuting more tokens than this can lead to performance
            # issues.
            if len(resource_tokens) < 7:
                permutations = get_resource_label_permutations(resource_label)
                for permutation in permutations:
                    lookup_table["standard_resource_label_permutations"][
                        permutation] = resource_id

            # The three synonym annotations are handled identically, so
            # process them in one loop rather than three copies of the
            # same block.
            for synonym_key in ("oboInOwl:hasSynonym",
                                "oboInOwl:hasNarrowSynonym",
                                "oboInOwl:hasExactSynonym"):
                if synonym_key in resource:
                    for synonym in resource[synonym_key]:
                        # Standardize synonym
                        synonym = punctuation_treatment(synonym.lower())
                        lookup_table["synonyms"][synonym] = resource_label

            if "parent_id" in resource:
                # Standardize parent_id
                parent_id = resource["parent_id"].replace(":", "_")
                parent_id = parent_id.lower()

                # Bug in ``ontofetch.py``--sometimes a resource is
                # parent to itself. Remove when fixed.
                if resource_id == parent_id:
                    continue
                # Instead of overwriting parents like we do with
                # synonyms, we will concatenate parents from different
                # fetches.
                elif resource_id in lookup_table["parents"]:
                    # Prevent duplicates
                    if parent_id not in lookup_table["parents"][resource_id]:
                        lookup_table["parents"][resource_id] += [parent_id]
                else:
                    lookup_table["parents"][resource_id] = [parent_id]

                if "other_parents" in resource:
                    # Standardize values
                    other_parents = list(
                        map(lambda x: x.replace(":", "_").lower(),
                            resource["other_parents"]))
                    # Prevent duplicates
                    other_parents = list(
                        filter(
                            lambda x: x not in lookup_table["parents"][
                                resource_id], other_parents))
                    # Bug in ``ontofetch.py``--sometimes a resource is
                    # parent to itself. Remove when fixed.
                    other_parents = list(
                        filter(lambda x: x != resource_id, other_parents))

                    lookup_table["parents"][resource_id] += other_parents

    return lookup_table
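
# Illustrative sketch only: a minimal ``fetched_ontology`` in the shape the
# parser above expects. The ID, label, synonym, and parent values are made up
# for demonstration; real input is the JSON output of ontofetch.py.
def _example_add_fetched_ontology():
    """Feed a hand-built fetch result through the parser (sketch)."""
    fetched_ontology = {
        "specifications": {
            "term_1": {
                "id": "EX:0000001",
                "label": "Chicken Breast",
                "oboInOwl:hasExactSynonym": ["breast of chicken"],
                "parent_id": "EX:0000002",
            }
        }
    }
    lookup_table = create_lookup_table_skeleton()
    lookup_table = add_fetched_ontology_to_lookup_table(lookup_table,
                                                        fetched_ontology)
    # IDs are lowercased and ":" becomes "_", so the key is "ex_0000001".
    assert "ex_0000001" in lookup_table["non_standard_resource_ids"]
    return lookup_table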
def find_match(map_result, ontology_lookup_table):
    """Map input terms in ``map_result`` to ontology terms.

    Attempts a full term match for each input term first, and falls
    back to component matches otherwise.

    :param dict map_result: Mapping run state; see the keys read below
    :param dict ontology_lookup_table: Lookup table of ontology terms
    :return: Modified ``map_result``
    :rtype: dict
    """
    mapping_output = map_result["mapping_output"]
    input_to_ontology_mapping = map_result["input_to_ontology_mapping"]
    ontology_to_input_mapping = map_result["ontology_to_input_mapping"]
    input_term_label_map = map_result["input_term_label"]
    ontology_term_label_map = map_result["ontology_term_label"]

    for input_term, mapping_object in mapping_output.items():
        input_term_id = get_term_id(input_term)
        input_term_label = get_term_label(input_term)

        # Standardize sample to lowercase and with punctuation
        # treatment.
        sample = input_term_label.lower()
        sample = helpers.punctuation_treatment(sample)

        # Tokenize sample and remove stop words and 1-letter words
        sample_tokens = word_tokenize(sample)

        # Get "cleaned_sample"
        cleaned_sample = ""
        for token in sample_tokens:
            # Ignore dates and numbers
            if helpers.is_date(token) or helpers.is_number(token):
                continue
            # Ignore single letters
            if helpers.is_single_letter(token):
                continue

            # Some preprocessing
            token = helpers.preprocess(token)

            lemma = helpers.singularize_token(token, ontology_lookup_table,
                                              [])
            lemma = helpers.spelling_correction(lemma, ontology_lookup_table,
                                                [])
            lemma = helpers.abbreviation_normalization_token(
                lemma, ontology_lookup_table, [])
            lemma = helpers.non_English_normalization_token(
                lemma, ontology_lookup_table, [])

            cleaned_sample = helpers.get_cleaned_sample(
                cleaned_sample, lemma, ontology_lookup_table)
            cleaned_sample = re.sub(' +', ' ', cleaned_sample)
            cleaned_sample = helpers.abbreviation_normalization_phrase(
                cleaned_sample, ontology_lookup_table, [])
            cleaned_sample = helpers.non_English_normalization_phrase(
                cleaned_sample, ontology_lookup_table, [])

        cleaned_sample = helpers.remove_duplicate_tokens(cleaned_sample)

        # Attempt full term match
        full_term_match = helpers.map_term(sample, ontology_lookup_table)

        if not full_term_match:
            # Attempt full term match with cleaned sample
            full_term_match = helpers.map_term(cleaned_sample,
                                               ontology_lookup_table)

        if not full_term_match:
            # Attempt full term match using suffixes
            full_term_match = helpers.map_term(sample, ontology_lookup_table,
                                               consider_suffixes=True)

        if not full_term_match:
            # Attempt full term match with cleaned sample using suffixes
            full_term_match = helpers.map_term(cleaned_sample,
                                               ontology_lookup_table,
                                               consider_suffixes=True)

        if full_term_match:
            ontology_term_id = full_term_match["id"].upper()
            ontology_term_label = full_term_match["term"]
            ontology_term = "%s:%s" % (ontology_term_label, ontology_term_id)

            mapping_output[input_term] = ontology_term
            input_to_ontology_mapping[input_term_id] = ontology_term_id
            ontology_to_input_mapping[ontology_term_id] = input_term_id
            ontology_term_label_map[ontology_term_id] = \
                full_term_match["term"]
            input_term_label_map[input_term_id] = input_term_label
        else:
            # Attempt various component matches
            component_matches = []
            covered_tokens = set()

            for i in range(5, 0, -1):
                for gram_chunk in helpers.get_gram_chunks(cleaned_sample, i):
                    concat_gram_chunk = " ".join(gram_chunk)
                    gram_tokens = word_tokenize(concat_gram_chunk)
                    gram_permutations = \
                        list(OrderedDict.fromkeys(permutations(
                            concat_gram_chunk.split())))

                    # gram_tokens covered in prior component match
                    if set(gram_tokens) <= covered_tokens:
                        continue

                    for gram_permutation in gram_permutations:
                        gram_permutation_str = " ".join(gram_permutation)
                        component_match = helpers.map_term(
                            gram_permutation_str, ontology_lookup_table)
                        if not component_match:
                            # Try again with suffixes
                            component_match = helpers.map_term(
                                gram_permutation_str, ontology_lookup_table,
                                consider_suffixes=True)
                        if component_match:
                            component_matches.append(component_match)
                            covered_tokens.update(gram_tokens)
                            break

            # We should not consider component matches that are
            # ancestral to other component matches.
            ancestors = set()
            for component_match in component_matches:
                component_match_hierarchies = \
                    helpers.get_term_parent_hierarchies(
                        component_match["id"], ontology_lookup_table)
                for component_match_hierarchy in component_match_hierarchies:
                    # We do not need the first element
                    component_match_hierarchy.pop(0)
                    ancestors |= set(component_match_hierarchy)

            matched_components = []
            for component_match in component_matches:
                if component_match["id"] not in ancestors:
                    matched_component = "%s:%s" % (component_match["term"],
                                                   component_match["id"])
                    matched_components.append(matched_component)

            # TODO: revisit this step.
            #  We do need it, but perhaps the function could be
            #  simplified?
            if len(matched_components):
                matched_components = helpers.retain_phrase(matched_components)

            if matched_components:
                if len(matched_components) == 1:
                    single_value = matched_components[0]
                    onto_term_id = get_term_id(single_value)
                    onto_term_label = get_term_label(single_value)

                    mapping_output[input_term] = single_value
                    input_to_ontology_mapping[input_term_id] = onto_term_id
                    if onto_term_id not in ontology_to_input_mapping:
                        ontology_to_input_mapping[onto_term_id] = \
                            input_term_id
                    ontology_term_label_map[onto_term_id] = onto_term_label
                    input_term_label_map[input_term_id] = input_term_label
                else:
                    mapping_output[input_term] = matched_components
                    input_to_ontology_mapping[input_term_id] = \
                        [get_term_id(s) for s in matched_components]
                    input_term_label_map[input_term_id] = input_term_label
                    for ontology_term in matched_components:
                        onto_term_id = get_term_id(ontology_term)
                        onto_term_label = get_term_label(ontology_term)
                        ontology_term_label_map[onto_term_id] = \
                            onto_term_label

    return map_result
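
# Illustrative sketch only: the ``map_result`` structure find_match expects,
# with the five keys read at the top of the function. An empty
# ``mapping_output`` makes the call a no-op, so the sketch is safe to run.
def _example_find_match_skeleton(ontology_lookup_table):
    """Show the expected ``map_result`` structure (sketch)."""
    map_result = {
        "mapping_output": {},  # input term -> matched ontology term(s)
        "input_to_ontology_mapping": {},
        "ontology_to_input_mapping": {},
        "input_term_label": {},
        "ontology_term_label": {},
    }
    return find_match(map_result, ontology_lookup_table)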
def run(args):
    """Main text mining pipeline."""
    # If the user specified a profile, we must retrieve args specified
    # by the profile, unless they were explicitly overridden.
    if args.profile:
        args = pipeline_resources.get_profile_args(args)

    # To contain all resources, and their variations, that samples are
    # matched to. Start by adding pre-defined resources from
    # lexmapr.predefined_resources.
    # TODO: These pre-defined resources are the remnants of early
    #  LexMapr development. We should eventually move to only adding
    #  terms from online ontologies to lookup tables.
    lookup_table = pipeline_resources.get_predefined_resources()

    # To contain resources fetched from online ontologies, if any.
    # Will eventually be added to ``lookup_table``.
    ontology_lookup_table = None
    if args.config:
        # Fetch online ontology terms specified in config file.
        ontology_lookup_table = pipeline_resources.get_config_resources(
            args.config, args.no_cache)
    elif args.profile:
        # Fetch online ontology terms specified in profile.
        ontology_lookup_table = pipeline_resources.get_profile_resources(
            args.profile)

    if ontology_lookup_table:
        # Merge ``ontology_lookup_table`` into ``lookup_table``
        lookup_table = helpers.merge_lookup_tables(lookup_table,
                                                   ontology_lookup_table)

    # To contain resources used in classification.
    classification_lookup_table = None
    if args.bucket:
        classification_lookup_table = \
            pipeline_resources.get_classification_resources()

    # Output file column headings
    output_fields = [
        "Sample_Id", "Sample_Desc", "Cleaned_Sample", "Matched_Components"
    ]
    if args.full:
        output_fields += [
            "Match_Status(Macro Level)", "Match_Status(Micro Level)"
        ]
    if args.bucket:
        if args.full:
            output_fields += [
                "LexMapr Classification (Full List)", "LexMapr Bucket",
                "Third Party Bucket", "Third Party Classification"
            ]
        else:
            output_fields += ["Third Party Classification"]

    # Main output file
    fw = open(args.output, 'w') if args.output else sys.stdout
    fw.write('\t'.join(output_fields))

    # Input file
    fr = open(args.input_file, "r")
    _, ext = os.path.splitext(args.input_file)
    if ext == ".csv":
        fr_reader = csv.reader(fr, delimiter=",")
    elif ext == ".tsv":
        fr_reader = csv.reader(fr, delimiter="\t")
    else:
        raise ValueError("Input file must have a .csv or .tsv extension")
    # Skip header
    next(fr_reader)

    # Iterate over samples for matching to ontology terms
    for row in fr_reader:
        sample_id = row[0].strip()
        original_sample = " ".join(row[1:]).strip()
        cleaned_sample = ""
        matched_components = []
        macro_status = "No Match"
        micro_status = []
        lexmapr_classification = []
        lexmapr_bucket = []
        third_party_bucket = []
        third_party_classification = []

        # Standardize sample to lowercase and with punctuation
        # treatment.
        sample = original_sample.lower()
        sample = helpers.punctuation_treatment(sample)

        sample_tokens = word_tokenize(sample)

        # Get ``cleaned_sample``
        for tkn in sample_tokens:
            # Ignore dates and numbers
            if helpers.is_date(tkn) or helpers.is_number(tkn):
                continue

            # Some preprocessing
            tkn = helpers.preprocess(tkn)

            lemma = helpers.singularize_token(tkn, lookup_table, micro_status)
            lemma = helpers.spelling_correction(lemma, lookup_table,
                                                micro_status)
            lemma = helpers.abbreviation_normalization_token(
                lemma, lookup_table, micro_status)
            lemma = helpers.non_English_normalization_token(
                lemma, lookup_table, micro_status)

            cleaned_sample = helpers.get_cleaned_sample(
                cleaned_sample, lemma, lookup_table)
            cleaned_sample = re.sub(' +', ' ', cleaned_sample)
            cleaned_sample = helpers.abbreviation_normalization_phrase(
                cleaned_sample, lookup_table, micro_status)
            cleaned_sample = helpers.non_English_normalization_phrase(
                cleaned_sample, lookup_table, micro_status)

        cleaned_sample = helpers.remove_duplicate_tokens(cleaned_sample)

        # Attempt full term match
        full_term_match = helpers.map_term(sample, lookup_table)

        if not full_term_match:
            # Attempt full term match with cleaned sample
            full_term_match = helpers.map_term(cleaned_sample, lookup_table)
            if full_term_match:
                micro_status.insert(0, "Used Cleaned Sample")

        if not full_term_match:
            # Attempt full term match using suffixes
            full_term_match = helpers.map_term(sample, lookup_table,
                                               consider_suffixes=True)

        if not full_term_match:
            # Attempt full term match with cleaned sample using suffixes
            full_term_match = helpers.map_term(cleaned_sample, lookup_table,
                                               consider_suffixes=True)
            if full_term_match:
                micro_status.insert(0, "Used Cleaned Sample")

        if full_term_match:
            matched_components.append(full_term_match["term"] + ":" +
                                      full_term_match["id"])
            macro_status = "Full Term Match"
            micro_status += full_term_match["status"]

            if args.bucket:
                classification_result = classify_sample(
                    sample, matched_components, lookup_table,
                    classification_lookup_table)
                lexmapr_classification = classification_result[
                    "lexmapr_hierarchy_buckets"]
                lexmapr_bucket = classification_result[
                    "lexmapr_final_buckets"]
                third_party_bucket = classification_result[
                    "ifsac_final_buckets"]
                third_party_classification = classification_result[
                    "ifsac_final_labels"]
        else:
            # Attempt various component matches
            component_matches = []
            covered_tokens = set()

            for i in range(5, 0, -1):
                for gram_chunk in helpers.get_gram_chunks(cleaned_sample, i):
                    concat_gram_chunk = " ".join(gram_chunk)
                    gram_tokens = word_tokenize(concat_gram_chunk)
                    gram_permutations = \
                        list(OrderedDict.fromkeys(permutations(
                            concat_gram_chunk.split())))

                    # gram_tokens covered in prior component match
                    if set(gram_tokens) <= covered_tokens:
                        continue

                    for gram_permutation in gram_permutations:
                        gram_permutation_str = " ".join(gram_permutation)
                        component_match = helpers.map_term(
                            gram_permutation_str, lookup_table)
                        if not component_match:
                            # Try again with suffixes
                            component_match = helpers.map_term(
                                gram_permutation_str, lookup_table,
                                consider_suffixes=True)
                        if component_match:
                            component_matches.append(component_match)
                            covered_tokens.update(gram_tokens)
                            break

            # We should not consider component matches that are
            # ancestral to other component matches.
            ancestors = set()
            for component_match in component_matches:
                component_match_hierarchies = \
                    helpers.get_term_parent_hierarchies(
                        component_match["id"], lookup_table)
                for component_match_hierarchy in component_match_hierarchies:
                    # We do not need the first element
                    component_match_hierarchy.pop(0)
                    ancestors |= set(component_match_hierarchy)

            for component_match in component_matches:
                if component_match["id"] not in ancestors:
                    matched_component = (component_match["term"] + ":" +
                                         component_match["id"])
                    matched_components.append(matched_component)

            # TODO: revisit this step.
            #  We do need it, but perhaps the function could be
            #  simplified?
            if len(matched_components):
                matched_components = helpers.retain_phrase(matched_components)

            # Finalize micro_status
            # TODO: This is ugly, so revisit after revisiting
            #  ``retain_phrase``.
            micro_status_covered_matches = set()
            for component_match in component_matches:
                possible_matched_component = (component_match["term"] + ":" +
                                              component_match["id"])
                if possible_matched_component in matched_components:
                    if possible_matched_component not in \
                            micro_status_covered_matches:
                        micro_status_covered_matches.add(
                            possible_matched_component)
                        micro_status.append(
                            "{%s: %s}" % (component_match["term"],
                                          component_match["status"]))

            if matched_components:
                macro_status = "Component Match"

            if args.bucket:
                classification_result = classify_sample(
                    sample, matched_components, lookup_table,
                    classification_lookup_table)
                lexmapr_classification = classification_result[
                    "lexmapr_hierarchy_buckets"]
                lexmapr_bucket = classification_result[
                    "lexmapr_final_buckets"]
                third_party_bucket = classification_result[
                    "ifsac_final_buckets"]
                third_party_classification = classification_result[
                    "ifsac_final_labels"]

        # Write row
        fw.write("\n" + sample_id + "\t" + original_sample + "\t" +
                 cleaned_sample + "\t" + str(matched_components))
        if args.full:
            fw.write("\t" + macro_status + "\t" + str(micro_status))
        if args.bucket:
            if args.full:
                fw.write("\t" + str(lexmapr_classification) + "\t" +
                         str(lexmapr_bucket) + "\t" +
                         str(third_party_bucket))
            fw.write("\t" + str(sorted(third_party_classification)))

    fw.write('\n')
    # Close output file, but never stdout
    if fw is not sys.stdout:
        fw.close()
    # Close input file
    fr.close()
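
# Illustrative sketch only: calling ``run`` directly with an
# ``argparse.Namespace`` carrying the attributes read above (input_file,
# output, config, profile, no_cache, full, bucket). The file names are
# placeholders; real invocations normally go through the LexMapr CLI.
def _example_run_pipeline():
    """Run the pipeline on a hypothetical two-column CSV (sketch)."""
    from argparse import Namespace
    run(Namespace(input_file="samples.csv", output="matches.tsv",
                  config=None, profile=None, no_cache=False,
                  full=False, bucket=False))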
def refine_ifsac_final_labels(sample, ifsac_final_labels, label_refinements):
    """Get refined final labels after application of customized rules.

    :param str sample: sample
    :param set ifsac_final_labels: the final labels set
    :param dict label_refinements: the dictionary of label refinement
        resource
    :return: refined final labels
    :rtype: list
    """
    # Caution: Rules are sequential - changing the order might change
    # results.
    ret = set(ifsac_final_labels)
    sample = helpers.punctuation_treatment(sample)
    sample_tokens = word_tokenize(sample)
    sample_tokens_set = set(sample_tokens)

    for label, refined_label in label_refinements.items():
        label_tokens = word_tokenize(label)
        # ``re.escape`` guards against regex metacharacters in labels.
        if not (set(label_tokens) - set(sample_tokens)) or re.search(
                r"\b" + re.escape(label) + r"\b", sample):
            ret.add(refined_label)
            break

    # Define different groups/categories of classes
    specific_meat_categories = {
        "pork", "chicken", "beef", "fish", "game", "poultry", "turkey"
    }
    mollusk_categories = {"mollusks (non-bi-valve)", "mollusks (bi-valve)"}
    shellfish_categories = {"crustaceans", "mollusks"} | mollusk_categories
    aquatic_animal_categories = \
        {"fish", "other aquatic animals"} | shellfish_categories
    poultry_categories = {"other poultry", "chicken", "turkey"}
    avian_categories = \
        {"other poultry", "game", "poultry"} | poultry_categories
    animal_categories = {
        "human", "companion animal", "aquatic animals", "wild animal",
        "beef", "pork", "other meat", "cow", "pig"
    }
    animal_categories |= avian_categories | aquatic_animal_categories | {
        "other animal"
    }
    veterinary_categories = avian_categories | aquatic_animal_categories | {
        "other animal"
    }
    veterinary_categories |= {
        "animal", "avian", "companion animal", "aquatic animals",
        "wild animal", "beef", "pork", "other meat", "cow", "pig"
    }
    environmental_categories = {
        "environmental-water", "environmental-farm",
        "environmental-restaurant", "environmental-retail",
        "environmental-abattoir", "environmental-warehouse",
        "environmental-researchfacility", "environmental-pasture",
        "environmental-animal housing",
        "environmental-factory/production facility",
        "environmental-vehicle", "environmental-construction"
    }
    root_underground_categories = {
        "root/underground (roots)", "root/underground (tubers)",
        "root/underground (bulbs)", "root/underground (other)"
    }
    seeded_vegetable_categories = {
        "seeded vegetables (vine-grown)", "seeded vegetables (solanaceous)",
        "seeded vegetables (legumes)", "seeded vegetables (other)"
    }
    vegetable_categories = {
        "fungi", "sprouts", "root/underground", "seeded vegetables", "herbs",
        "vegetable row crops (flower)", "vegetable row crops (stem)",
        "vegetable row crops (leafy)"
    }
    vegetable_categories |= \
        root_underground_categories | seeded_vegetable_categories
    fruit_categories = {
        "melon fruit", "pome fruit", "stone fruit", "sub-tropical fruit",
        "small fruit", "tropical fruit"
    }
    plant_categories = {
        "oils", "vegetables", "fruits", "grains", "beans", "nuts", "seeds"
    }
    plant_categories |= vegetable_categories | fruit_categories
    other_plant_food_category = {
        "other (food additive)", "dietary supplement", "other (sweetener)",
        "other (flavoring and seasoning", "other (confectionary)"
    }
    other_animal_food_category = {"meat", "other meat", "beef", "pork"}

    # Customized rules for refinement of class labels

    # Deals with "animal feed" class
    if "animal feed" in ret and "by" in sample \
            and "by product" not in sample:
        ret.remove("animal feed")

    # Deals with "clinical/research" class
    if "clinical/research" in ret \
            and ret.intersection(plant_categories
                                 | other_plant_food_category) \
            and not ("swab" in sample or "clinical" in sample):
        ret.remove("clinical/research")
    if "clinical/research" in ret and "swab sub" in sample:
        ret.clear()
        ret.add("environmental")
    if "clinical/research" in ret and "scat" in sample:
        ret.remove("clinical/research")
        ret.add("environmental")
    if "clinical/research" in ret and "environmental" in ret \
            and not ("tissue" in sample or "biological" in sample):
        ret.remove("clinical/research")
    if "clinical/research" in ret and ret.intersection(
            environmental_categories):
        ret.remove("clinical/research")
    if "clinical/research" in ret \
            and (ret.intersection(plant_categories)
                 or ret.intersection(animal_categories)):
        if "shell" in sample or "shell on" in sample or "shellon" in sample:
            ret.remove("clinical/research")
    if "clinical/research" in ret \
            and ret.intersection(veterinary_categories):
        ret.remove("clinical/research")
        ret.add("veterinary clinical/research")
    if "veterinary clinical/research" in ret and "animal" in ret:
        ret.remove("animal")

    # Converts an otherwise-unspecified animal to "other animal",
    # unless the sample is just the general "animal" class.
    if "animal" in ret and sample != "animal":
        ret.remove("animal")
        ret.add("other animal")

    # Deals with "dairy", "cow" and "beef" cases
    if "dairy" in ret and "cow" in ret:
        ret.remove("cow")
    if "beef" in ret and "dairy" in ret and "milk" in sample:
        ret.remove("beef")
    beef_keywords = [
        "raw cow", "raw veal", "raw calf", "meat", "beef", "cow lung",
        "cow liver", "cow heart"
    ]
    for entry in beef_keywords:
        if entry in sample and "cow" in ret:
            ret.remove("cow")
            ret.add("beef")
    pork_keywords = ["raw pig", "raw swine", "meat", "pork", "porcine"]
    for entry in pork_keywords:
        if entry in sample and "pig" in ret:
            ret.remove("pig")
            ret.add("pork")
    if "cow" in ret and "beef" in ret:
        ret.remove("cow")
    if "beef" in ret and "fish" in ret and ("fillet" in sample
                                            or "filet" in sample):
        ret.remove("beef")
    if "beef" in ret and "veterinary clinical/research" in ret:
        ret.remove("beef")
        ret.add("cow")
    if "oils" in ret and "in oil" in sample:
        ret.remove("oils")
    if "other (sweetener)" in ret and "sugar free" in sample:
        ret.remove("other (sweetener)")

    # Deals with "fish", "shellfish" and "eggs" cases
    if "shellfish" in ret and "fish" in ret:
        ret.remove("fish")
    if "fish" in ret and "eggs" in ret:
        ret.remove("eggs")
    if "fish eggs" in ret and "eggs" in ret:
        ret.remove("fish eggs")
    if "fish" in ret and "poultry" in ret:
        ret.remove("poultry")
    if "fish" in ret and "other poultry" in ret:
        ret.remove("other poultry")
    if "poultry" in ret and "eggs" in ret:
        ret.remove("poultry")

    # Deals with "pig", "pork" and "meat" cases
    if ("pork" in ret or "pork" in sample) and "pig" in ret:
        ret.remove("pig")
        ret.add("pork")
    if ("pork" in ret or "pork" in sample) and "meat" in ret:
        ret.remove("meat")
        ret.add("pork")
    if "pork" in ret and "veterinary clinical/research" in ret:
        ret.remove("pork")
        ret.add("pig")
    if "meat" in ret and ("veterinary clinical/research" in ret
                          or "engineered seafood" in ret):
        ret.remove("meat")
    if ret.intersection(specific_meat_categories) and "meat" in ret:
        ret.remove("meat")

    # Deals with cases where clinical/research and meats are both
    # present.
    if not ret.intersection(animal_categories) and "other meat" in ret \
            and ("veterinary clinical/research" in ret
                 or "clinical/research" in ret):
        ret.remove("other meat")
        ret.add("other animal")
    if not ret.intersection(animal_categories) and "meat" in ret \
            and ("veterinary clinical/research" in ret
                 or "clinical/research" in ret):
        ret.remove("meat")
        if "liver" not in sample:
            ret.add("other animal")
    if not ret.intersection(animal_categories) \
            and "veterinary clinical/research" in ret:
        ret.add("other animal")

    # Retains the specific (more granular) animal classes
    if "mollusks" in ret and ret.intersection(mollusk_categories):
        ret.remove("mollusks")
    if "shellfish" in ret and ret.intersection(shellfish_categories):
        ret.remove("shellfish")
    if "aquatic animals" in ret and ret.intersection(
            aquatic_animal_categories):
        ret.remove("aquatic animals")
    if "poultry" in ret and ret.intersection(poultry_categories):
        ret.remove("poultry")
    if "other animal" in ret and ret.intersection(avian_categories):
        ret.remove("other animal")
    if "animal" in ret and ret.intersection(animal_categories):
        ret.remove("animal")
    if "engineered seafood" in ret and ret.intersection(
            aquatic_animal_categories):
        ret -= aquatic_animal_categories
    if "engineered seafood" in ret and "aquatic animals" in ret:
        ret.remove("aquatic animals")
    if ("engineered seafood" in ret or "companion animal" in ret) \
            and "other animal" in ret:
        ret.remove("other animal")

    # Retains the specific (more granular) plant classes
    if "root/underground" in ret and ret.intersection(
            root_underground_categories):
        ret.remove("root/underground")
    if "seeded vegetables" in ret and ret.intersection(
            seeded_vegetable_categories):
        ret.remove("seeded vegetables")
    if "vegetables" in ret and ret.intersection(vegetable_categories):
        ret.remove("vegetables")
    if "fruits" in ret and ret.intersection(fruit_categories):
        ret.remove("fruits")
    if "plant" in ret and ret.intersection(plant_categories):
        ret.remove("plant")

    # Deals with the "nuts" and "seeds", and "environmental-water" and
    # "fish" cases.
    if "nuts" in ret and "seeds" in ret and len(ret) == 2:
        ret.remove("seeds")
    if "environmental-water" in ret and "fish" in ret and len(ret) == 2:
        ret.remove("environmental-water")

    # Retains the specific (more granular) environmental classes
    if "environmental" in ret \
            and ret.intersection(environmental_categories):
        ret.remove("environmental")
    if ("environmental-animal housing" in ret
            or "environmental-abattoir" in ret
            or "environmental-farm" in ret) \
            and "environmental-factory/production facility" in ret:
        ret.remove("environmental-factory/production facility")
    if "environmental-abattoir" in ret \
            and "environmental-factory/production facility" in ret:
        ret.remove("environmental-factory/production facility")

    exclusions = {
        'clinical/research', 'veterinary clinical/research', 'animal feed',
        'human', 'environmental'
    }
    # Assigns multi-ingredient to the cases where multiple food
    # ingredients have been tagged.
    if not (ret.intersection(exclusions)
            or ret.intersection(environmental_categories)) \
            and len(ret) >= 3:
        # To be revisited and revised as per evaluation
        ret.add("multi-ingredient")

    # Deals with some specific cases
    if "other meat" in ret and "other animal" in ret:
        ret.remove("other animal")
    if "meat" in ret and ret.intersection(animal_categories):
        if len(ret) == 3 and "multi-ingredient" in ret:
            ret.remove("multi-ingredient")
            ret.remove("meat")
        else:
            ret.remove("meat")

    # Retains the specific (more granular) classes and removes the
    # general "food" class.
    if "food" in ret and ret.intersection(
            animal_categories | plant_categories | other_animal_food_category
            | other_plant_food_category | {"plant", "animal"}):
        ret.remove("food")
    if "food" in ret and ("dairy" in ret or "environmental" in ret
                          or "clinical/research" in ret
                          or "veterinary clinical/research" in ret):
        ret.remove("food")

    # Deals with additional/unique cases
    if "food" in ret and "environmental" in ret and "leaf" in sample:
        ret.remove("environmental")
    if "environmental-animal housing" in ret and "finished" in sample:
        ret.remove("environmental-animal housing")
    if ("chicken" in ret or "poultry" in ret or "other poultry" in ret
            or "cow" in ret) \
            and "environmental-factory/production facility" in ret:
        ret.remove("environmental-factory/production facility")
        ret.add("environmental-farm")
    if "eggs" in ret and "veterinary clinical/research" in ret:
        ret.remove("veterinary clinical/research")
    if "environmental" in ret \
            and ("multi-ingredient" in ret
                 or ret.intersection(plant_categories)) \
            and not ("swab" in sample or "environmental" in sample):
        ret.remove("environmental")

    # Deals with body parts that are food for specific animal
    # categories and not clinical/research.
    food_anatomical_parts = {
        'heart', 'liver', 'lung', 'leg', 'shell-on', 'shell', 'soft shell',
        'tail', 'hlso', 'shellon', 'beef', 'pork', 'meat', 'porcine',
        'shell on'
    }
    body_part_for_food_animal_categories = \
        aquatic_animal_categories | shellfish_categories \
        | poultry_categories | {"cow"}
    if "veterinary clinical/research" in ret \
            and ret.intersection(body_part_for_food_animal_categories) \
            and sample_tokens_set.intersection(food_anatomical_parts) \
            and "swab" not in sample:
        ret.remove("veterinary clinical/research")

    # Deals with very specific disambiguation tokens
    disambiguation_words = {
        'ground', 'scraps', 'cut', 'smoke', 'moon', 'plain'
    }
    if "environmental" in ret \
            and (ret.intersection(animal_categories)
                 or ret.intersection(plant_categories) or "dairy" in ret) \
            and sample_tokens_set.intersection(disambiguation_words):
        ret.remove("environmental")

    # Retains the general class (only animal feed)
    if "animal feed" in ret:
        ret.clear()
        ret.add("animal feed")

    # Deals with multi-ingredient case
    if ("multi-ingredient" in ret or "food supplement" in ret) \
            and "food" in ret:
        ret.remove("food")
    if "food" in ret and len(ret) < 2:
        ret.remove("food")
        ret.add("multi-ingredient")

    return list(ret)