Beispiel #1
0
 def __init__(self, type, key_words, secondary_words, excluding_words):
     """Store the type name and the three word collections of one entry.

     For each word collection a fresh ``Tree.AVLTree`` is created and the
     collection is handed to its ``insert_array`` method; the attribute
     keeps whatever ``insert_array`` returns.

     Args:
         type: Name of the type; stored as ``self.type_name``.
         key_words: Words passed to ``insert_array`` for ``self.key_words``.
         secondary_words: Words passed to ``insert_array`` for
             ``self.secondary_words``.
         excluding_words: Words passed to ``insert_array`` for
             ``self.excluding_words``.
     """
     # NOTE(review): the attributes hold the return value of insert_array,
     # not the tree object itself -- confirm insert_array returns the tree.
     self.type_name = type
     self.key_words = Tree.AVLTree().insert_array(key_words)
     self.secondary_words = Tree.AVLTree().insert_array(secondary_words)
     self.excluding_words = Tree.AVLTree().insert_array(excluding_words)
Beispiel #2
0
def _build_word_tree(words):
    """Return a new ``Tree.AVLTree`` with *words* inserted via ``insert_array``."""
    tree = Tree.AVLTree()
    tree.insert_array(words)
    return tree


def _best_scoring_index(scores):
    """Return the index of the highest strictly positive score, or ``None``.

    Ties resolve to the first index, exactly like
    ``scores.index(max(scores))``. An empty list, or a list whose best score
    is zero or negative, yields ``None``.
    """
    best = max(range(len(scores)), key=scores.__getitem__, default=None)
    if best is None or scores[best] <= 0:
        return None
    return best


def palabras_repetidas_dictionary_with_tree(text_to_classify):
    """Classify a text against the stored word dictionaries and print the result.

    The dictionary entries fetched through ``DB.GET_dictionary_from_DB()`` are
    loaded into AVL trees, the text is scored once per type for key words
    (mark 1), secondary words (mark 0.25) and excluding words (mark 1.5), and
    the best type according to key words and according to secondary words is
    printed, followed by the elapsed time.

    Excluding-word hits are subtracted from the key-word score and subtracted
    twice from the secondary-word score. The previous implementation negated
    the penalty and then subtracted it, which added it to the scores instead.

    Args:
        text_to_classify: Text to classify; it is first passed through
            ``Utils.delete_text_punctuation``.

    Returns:
        None. All results are written to standard output.
    """
    import time

    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments.
    start = time.perf_counter()

    print()
    print(" -------------------- Dictionary&Tree ----------------------- ")
    mongo_dictionary = DB.GET_dictionary_from_DB()  # from MongoDB

    # Entries are stored in order, so they can be taken as they are.
    dictionary = []
    for type_index in range(Utils.MIN_TYPES, Utils.MAX_TYPES):
        entry = mongo_dictionary[type_index]
        key_words_tree = _build_word_tree(entry.key_words)
        secondary_words_tree = _build_word_tree(entry.secondary_words)
        excluding_words_tree = _build_word_tree(entry.excluding_words)
        dictionary.append(
            Utils.Dictionary(entry.type_name, key_words_tree,
                             secondary_words_tree, excluding_words_tree))

    text_to_classify = Utils.delete_text_punctuation(text_to_classify)

    empty_words_tree = _build_word_tree(DB.GET_empty_words_from_DB())

    key_words_value = []
    secondary_words_value = []
    found_1words = []
    found_2words = []
    for sport in dictionary:
        key_value, key_found = sport.key_words.find_words_in_text(
            text_to_classify, word_mark=1, empty_words_tree=empty_words_tree)
        secondary_value, secondary_found = (
            sport.secondary_words.find_words_in_text(
                text_to_classify,
                word_mark=0.25,
                empty_words_tree=empty_words_tree))
        # The words matched here are not reported, only their score is used.
        penalty, _ = sport.excluding_words.find_words_in_text(
            text_to_classify, word_mark=1.5, empty_words_tree=empty_words_tree)

        # Excluding words lower the score of this type; secondary words are
        # penalised twice as hard as key words.
        key_words_value.append(key_value - penalty)
        secondary_words_value.append(secondary_value - penalty * 2)
        found_1words.append(key_found)
        found_2words.append(secondary_found)

    # NOTE(review): the index passed to get_data_name is the position inside
    # `dictionary`, which starts at 0 even when Utils.MIN_TYPES does not --
    # confirm that this is what get_data_name expects.
    best_key = _best_scoring_index(key_words_value)
    if best_key is not None:
        print('Segun primary words:',
              Utils.get_data_name(best_key),
              found_1words[best_key])
    else:
        print('Ninguna key word encontrada')

    best_secondary = _best_scoring_index(secondary_words_value)
    if best_secondary is not None:
        print('Segun secondary words:',
              Utils.get_data_name(best_secondary),
              found_2words[best_secondary])
    else:
        print('Ninguna secondary word encontrada')

    end = time.perf_counter()
    print('Ha tardo:', end - start, 'seg')