コード例 #1
0
def bag_of_words_weighted(synset, term_dictionary: Counter) -> tuple:
    """Score a synset by summing the weights of its context words.

    :param synset: candidate WordNet synset
    :param term_dictionary: Counter mapping terms to weights/frequencies
    :return: (synset, score) tuple
    """
    context_words = set(
        parser.cleaning(get_context(synset), parser.LEMMER).keys())
    # Only non-negative weights contribute to the score; a missing key
    # yields 0 from the Counter and therefore adds nothing.
    total = sum(term_dictionary[token]
                for token in context_words
                if term_dictionary[token] >= 0)
    return synset, total
コード例 #2
0
ファイル: main.py プロジェクト: LucaPrg/TLN
def process(df, clean_method: str) -> list:
    """Clean the concatenated definitions of every column in *df*.

    :param df: dataframe-like object whose columns hold definition strings
    :param clean_method: cleaning method name forwarded to parser.cleaning
    :return: list with one cleaned result per column
    """
    processed = []
    for column in df:
        # "".join avoids the quadratic cost of repeated string += in a loop
        # (assumes each definition is a str, as the original += required).
        concept = "".join(df[column])
        cleaned = parser.cleaning(concept,
                                  clean_method,
                                  frequency=MIN_FREQUENCY,
                                  percentage=MOST_COMMON_PERCENTAGE)
        processed.append(cleaned)
    return processed
コード例 #3
0
def process_file(path: str) -> list:
    """
    Read the file and extract sentences, then cluster them
    :param path: file path to read
    :return: clustered Counter list of the sentences
    """
    # Context manager guarantees the file is closed even if read() raises;
    # the original open()/close() pair leaked the handle on error.
    with open(path, "r") as file:
        file_sentences = re.split('[.!?]', file.read())

    cleaned_sentences = []
    for sentence in file_sentences:
        sentence_counter = parser.cleaning(sentence, parser.LEMMER)
        # Skip sentences that cleaned down to nothing.
        if sentence_counter:
            cleaned_sentences.append(sentence_counter)

    return cleaned_sentences
コード例 #4
0
def lesk(word: str, sentence: str):
    """
    Find the best synset of the word for the given sentence
    :param word: word needs to be disambiguated
    :param sentence: used to disambiguate
    :return: synset with best intersection between phrase and word context,
        or None when the word has no noun synsets
    """
    synsets = wordnet.synsets(word, pos=wordnet.NOUN)
    # Explicit emptiness check: the original bare `except:` around
    # synsets[0] also swallowed unrelated errors (KeyboardInterrupt, etc.).
    if not synsets:
        return None

    sentence_terms = set(
        parser.cleaning(sentence=sentence, method=parser.LEMMER))
    best_synset = synsets[0]
    best_score = 0
    for synset in synsets:
        candidate, score = bag_of_words(synset, sentence_terms)
        if score > best_score:
            best_score = score
            best_synset = candidate
    return best_synset
コード例 #5
0
def process(df, clean_method: str):
    """Clean every definition in *df* and bucket the results by column kind.

    :param df: dataframe-like object whose columns hold definition strings
    :param clean_method: cleaning method forwarded to parser.cleaning
    :return: dict mapping category constants to lists of cleaned definitions
    """
    value_table = {
        CONCRETE_G: [],
        CONCRETE_S: [],
        ABSTRACT_G: [],
        ABSTRACT_S: []
    }
    # Column-name fragment -> bucket key; order mirrors the original
    # if/elif chain so first match wins.
    categories = (
        ("concreto_generico", CONCRETE_G),
        ("concreto_specifico", CONCRETE_S),
        ("astratto_generico", ABSTRACT_G),
        ("astratto_specifico", ABSTRACT_S),
    )
    for column in df:
        # Keep only definitions that survive cleaning.
        cleaned_defs = [
            cleaned
            for cleaned in (parser.cleaning(d, clean_method)
                            for d in df[column])
            if len(cleaned) > 0
        ]
        for fragment, key in categories:
            if fragment in column:
                value_table[key] = cleaned_defs
                break
    return value_table
コード例 #6
0
def bag_of_words(synset, term_dictionary: set) -> tuple:
    """Count how many context words of *synset* appear in *term_dictionary*.

    :param synset: candidate WordNet synset
    :param term_dictionary: set of terms to intersect with
    :return: (synset, overlap size) tuple
    """
    # Iterating the cleaning result yields its keys, so set(...) is
    # equivalent to set(....keys()).
    context_terms = set(parser.cleaning(get_context(synset), parser.LEMMER))
    overlap = term_dictionary & context_terms
    return synset, len(overlap)