def bag_of_words_weighted(synset, term_dictionary: Counter) -> tuple:
    """
    Score a synset by summing the frequencies of its context words.

    :param synset: synset whose context is scored
    :param term_dictionary: Counter mapping words to their frequencies
    :return: (synset, score) where score is the total frequency mass of
             the synset's context words found in ``term_dictionary``
    """
    synset_context = set(
        parser.cleaning(get_context(synset), parser.LEMMER).keys())
    # A Counter returns 0 for missing keys and counts are never negative,
    # so the original ``if term_dictionary[word] >= 0`` guard was always
    # true; summing directly is equivalent and simpler.
    score = sum(term_dictionary[word] for word in synset_context)
    return synset, score
def process(df, clean_method: str) -> list:
    """
    Clean every column of the dataframe into one term collection per column.

    All definitions in a column are concatenated into a single text, then
    cleaned with the frequency / most-common-percentage filters.

    :param df: dataframe whose columns hold definition strings
    :param clean_method: cleaning method passed through to parser.cleaning
    :return: list with one cleaned term collection per column
    """
    processed = []
    for column in df:
        # join runs at C speed; the original ``concept += definition``
        # loop is quadratic in the total text length
        concept = "".join(df[column])
        cleaned = parser.cleaning(concept, clean_method,
                                  frequency=MIN_FREQUENCY,
                                  percentage=MOST_COMMON_PERCENTAGE)
        processed.append(cleaned)
    return processed
def process_file(path: str) -> list:
    """
    Read the file and extract sentences, then cluster them

    :param path: file path to read
    :return: clustered Counter list of the sentences
    """
    # context manager guarantees the handle is closed even if read/split raises;
    # the original open()/close() pair leaked the file on any exception
    with open(path, "r") as file:
        file_sentences = re.split('[.!?]', file.read())
    cleaned_sentences = []
    for sentence in file_sentences:
        sentence_counter = parser.cleaning(sentence, parser.LEMMER)
        if sentence_counter:  # skip sentences that clean down to nothing
            cleaned_sentences.append(sentence_counter)
    return cleaned_sentences
def lesk(word: str, sentence: str):
    """
    Find the best synset of the word for the given sentence

    :param word: word needs to be disambiguated
    :param sentence: used to disambiguate
    :return: synset with best intersection between phrase and word context,
             or None when the word has no noun synsets
    """
    synsets = wordnet.synsets(word, pos=wordnet.NOUN)
    # Explicit guard replaces the bare ``except:`` around synsets[0],
    # which silently swallowed every exception type
    if not synsets:
        return None
    sentence = set(parser.cleaning(sentence=sentence, method=parser.LEMMER))
    best_synset = synsets[0]
    best_score = 0
    for synset in synsets:
        new_synset, new_score = bag_of_words(synset, sentence)
        if new_score > best_score:
            best_score = new_score
            best_synset = new_synset
    return best_synset
def process(df, clean_method: str):
    """
    Clean each definition in the dataframe and bucket the results by
    the concept category encoded in the column name.

    :param df: dataframe whose columns hold definition strings
    :param clean_method: cleaning method passed through to parser.cleaning
    :return: dict mapping the four category keys to lists of cleaned
             definitions (columns matching no marker are ignored)
    """
    value_table = {
        CONCRETE_G: [],
        CONCRETE_S: [],
        ABSTRACT_G: [],
        ABSTRACT_S: []
    }
    # marker substring in the column name -> slot in value_table;
    # insertion order preserves the original if/elif check order
    markers = {
        "concreto_generico": CONCRETE_G,
        "concreto_specifico": CONCRETE_S,
        "astratto_generico": ABSTRACT_G,
        "astratto_specifico": ABSTRACT_S,
    }
    for column in df:
        processed = []
        for definition in df[column]:
            cleaned = parser.cleaning(definition, clean_method)
            if cleaned:  # drop definitions that clean down to nothing
                processed.append(cleaned)
        for marker, key in markers.items():
            if marker in column:
                # ``processed`` is rebuilt fresh for every column, so the
                # original defensive .copy() was an unneeded allocation
                value_table[key] = processed
                break
    return value_table
def bag_of_words(synset, term_dictionary: set) -> tuple:
    """
    Score a synset by the size of the overlap between its cleaned
    context words and the given term set.

    :param synset: synset whose context is compared
    :param term_dictionary: set of terms to intersect with
    :return: (synset, overlap size)
    """
    context_words = parser.cleaning(get_context(synset), parser.LEMMER)
    shared = term_dictionary & set(context_words)
    return synset, len(shared)