def test_whitespace_tokenize_no_stemming():
    """ Test whitespace tokenization on a standard sentence """
    expected_result = ['The', 'man,', 'Ivan', 'Ivanovich,', 'did', 'not', 'know',
                       'the', "cat's", 'eye', 'color.']
    assert tokenizer.whitespace_tokenize(PHRASE_1, None) == expected_result
def ingest_output_data(output_file, stemmer, tokenizer_str, stopword_str):
    """ Ingest the output data, populating the module-level condition dictionaries """
    with open(output_file, 'r', encoding='utf-8') as fs:
        lines = fs.readlines()
        for line in lines:
            # each line is tab-separated; the first column is the condition name
            parts = line.split('\t')
            condition = parts[0].lower()
            related_data = parts[0:]
            output_dict[condition] = related_data
            if 'whitespace' == tokenizer_str:
                # strip bracketing and punctuation characters before tokenizing
                tokens = tokenizer.whitespace_tokenize(
                    condition.replace('(', '').replace(')', '').replace(':', '').replace(',', ''),
                    stemmer)
            elif 'nltk' == tokenizer_str:
                tokens = tokenizer.nltk_tokenize(condition)
            else:
                tokens = []
            if 'aggressive' == stopword_str:
                output_token_dict[condition] = stopword.remove_agressive_stopwords(
                    tokens)
            elif 'nltk' == stopword_str:
                output_token_dict[condition] = stopword.remove_nltk_stopwords(
                    tokens)
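# Usage sketch (an assumption, not part of the original module): ingest_output_data
# relies on module-level `output_dict` and `output_token_dict` dictionaries and on a
# tab-separated output file whose first column is the condition name. The file name
# and argument values below are hypothetical placeholders.
#
#   output_dict = {}
#   output_token_dict = {}
#   ingest_output_data('conditions_output.tsv', 'Snowball', 'whitespace', 'nltk')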
def test_apply_nltk_snowball_stemmer():
    """ Test the Snowball stemmer on a standard sentence """
    stemmer = 'Snowball'
    expected_output = ['the', 'fox', 'was', 'quick', 'walk', 'by', 'the',
                       'seashor', 'in', 'the', 'morn', 'daylight.']
    tokens = tokenizer.whitespace_tokenize(PHRASE_1, stemmer)
    assert expected_output == tokens
def find_tokenized_variety(output_token_dict, conditions, threshold,
                           similarity_metric, stemmer, tokenizer_name):
    """ Search for the most relevant matches to the query based on tokenization

    :param output_token_dict: The pre-known conditions mapped to their tokens
    :param conditions: The list of conditions to consider
    :param threshold: The minimum similarity to retain
    :param similarity_metric: Which similarity strategy to use
    :param stemmer: Which stemmer to use
    :param tokenizer_name: Which tokenizer to use
    :type output_token_dict: dict
    :type conditions: list
    :type threshold: float
    :type similarity_metric: str
    :type stemmer: str
    :type tokenizer_name: str
    :return: the top search matches, sorted by descending similarity
    :rtype: list
    """
    total_dict = {}
    for condition in conditions:
        condition = condition.strip().lower()
        if 'whitespace' == tokenizer_name:
            tokens = tokenizer.whitespace_tokenize(condition, stemmer)
        elif 'nltk' == tokenizer_name:
            tokens = tokenizer.nltk_tokenize(condition)
        else:
            tokens = []
        for item in output_token_dict:
            item_tokens = output_token_dict[item]
            # compare the query tokens against the pre-known condition tokens
            if 'cosine' == similarity_metric:
                similarity = metrics.cosine_similarity(tokens, item_tokens)
            elif 'jaccard' == similarity_metric:
                similarity = metrics.jaccard_similarity(tokens, item_tokens)
            else:
                similarity = metrics.harmonic_similarity(tokens, item_tokens)
            if similarity > threshold:
                # keep the best similarity per item so results can be ranked descending
                if item in total_dict:
                    if similarity > total_dict[item]:
                        total_dict[item] = similarity
                else:
                    total_dict[item] = similarity
                print(condition + ' -> ' + item)
                print(similarity)
    sorted_by_similarity = sorted(total_dict.items(),
                                  key=operator.itemgetter(1), reverse=True)
    return sorted_by_similarity
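# Usage sketch (an assumption, not part of the original module): rank the pre-known
# conditions in `output_token_dict` against a query list. `operator`, `tokenizer`,
# and `metrics` are assumed to be imported at module level; the query term and
# threshold below are hypothetical.
#
#   matches = find_tokenized_variety(output_token_dict,
#                                    ['chronic kidney disease'],
#                                    0.5, 'cosine', 'Snowball', 'whitespace')
#   for name, score in matches:
#       print(name, score)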
def test_whitespace_tokenize_none():
    """ Test whitespace tokenization if None is received """
    expected_result = []
    assert tokenizer.whitespace_tokenize(None, None) == expected_result
def test_whitespace_tokenize_empty_sentence():
    """ Test whitespace tokenization on an empty string """
    expected_result = []
    assert tokenizer.whitespace_tokenize('', None) == expected_result