Example #1
def save_matrix_in_xlsx(cooc_matrix, pure_matrix, path_to_output_xlsx,
                        workbook_name):
    # The worksheets will be saved in an Excel workbook at the path and file name given below
    with MyExcelFileWrite(path_to_output_xlsx, workbook_name) as workbook:
        workbook.add_new_worksheet('pure_frequency_count')

        # A test worksheet holding the raw frequency counts of the verb-noun pairs
        workbook.write_matrix_in_xlsx(
            'pure_frequency_count', pure_matrix,
            utils.invert_dictionary(cooc_matrix.noun_rows),
            utils.invert_dictionary(cooc_matrix.verb_columns))

        # The worksheets below are created and associated with the workbook
        workbook.add_new_worksheet('cooc_matrix_full')
        workbook.add_new_worksheet('verb_filtered_arrays')

        # worksheet2 = get_new_worksheet('soc_pmi_matrix', workbook)

        inverted_matrix_noun_rows = utils.invert_dictionary(
            cooc_matrix.noun_rows)

        workbook.write_matrix_in_xlsx(
            'cooc_matrix_full', cooc_matrix.matrix, inverted_matrix_noun_rows,
            utils.invert_dictionary(cooc_matrix.verb_columns))

        ordered_verbs = utils.sort_dict(cooc_matrix.verb_filtered_arrays)
        # Keep only the verb names, in sorted order
        ordered_verbs = [pair[0] for pair in ordered_verbs]
        workbook.write_verb_filtered_arrays('verb_filtered_arrays',
                                            cooc_matrix.verb_filtered_arrays,
                                            cooc_matrix.nouns_from_verb_arrays,
                                            ordered_verbs)
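
The helpers utils.invert_dictionary and utils.sort_dict are not shown in these examples. Minimal sketches of their assumed behaviour, inferred from the call sites (the project's own implementations may differ, in particular in how sort_dict orders its items):

def invert_dictionary(dictionary):
    # Swap keys and values, e.g. {'noun': 3} -> {3: 'noun'}, so a matrix
    # index can be mapped back to the word it stands for.
    return {value: key for key, value in dictionary.items()}

def sort_dict(dictionary):
    # Return the items as a list of (key, value) pairs; sorted by key here
    # for illustration, the original may use a different ordering.
    return sorted(dictionary.items())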
Example #2
    def filter_coocmatrix2(self):

        print('filter_coocmatrix2 started')

        verb_keys = self.verb_columns.keys()

        num_of_rows = self.matrix.shape[0]
        # Number of rows corresponding to 30% of the nouns
        thirty_percent = int(np.ceil(0.3 * num_of_rows))

        inverted_noun_rows_dict = utils.invert_dictionary(self.noun_rows)

        for verb in verb_keys:
            verb_index = self.verb_columns[verb]

            # Column of co-occurrence counts for this verb
            temp_column = self.matrix[:, verb_index]
            # Indices of the 30% of nouns that co-occur most with this verb
            thirty_percent_largest_indices = heapq.nlargest(
                thirty_percent, range(num_of_rows), temp_column.take)

            most_co_occurring_nouns_values = list(
                temp_column[thirty_percent_largest_indices])

            self.verb_filtered_arrays[verb] = most_co_occurring_nouns_values
            self.nouns_from_verb_arrays[verb] = [
                inverted_noun_rows_dict[index]
                for index in thirty_percent_largest_indices
            ]

        print('filter_coocmatrix2 ended')
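
The pattern heapq.nlargest(k, range(n), array.take) used above, and again in Examples #3 and #5, ranks the indices 0..n-1 by the value stored at each index, so it returns the indices of the k largest entries rather than the entries themselves. A standalone illustration:

import heapq
import numpy as np

column = np.array([4.0, 9.0, 1.0, 7.0])
# The key function column.take maps an index to its value, so nlargest
# selects indices by value, in descending order of that value.
top_two_indices = heapq.nlargest(2, range(len(column)), column.take)
print(top_two_indices)          # [1, 3]
print(column[top_two_indices])  # [9. 7.]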
Example #3
    def get_20_percent_of_highest_pairs(self):
        # Number of elements that corresponds to 20% of the matrix
        twenty_percent = int(
            np.ceil(self.noun_rows_size * self.verb_columns_size * 0.2))

        # Flatten the matrix into a 1D array
        flat_matrix = np.asarray(self.matrix).ravel()

        # Indices of the 20% largest values in the flattened matrix
        twenty_percent_largest_ind = heapq.nlargest(
            twenty_percent,
            range(self.noun_rows_size * self.verb_columns_size),
            flat_matrix.take)

        # Convert each flat index back to a (row, column) pair of the 2D matrix
        real_aij_matrix_pos = [
            [flat_index // self.verb_columns_size,
             flat_index % self.verb_columns_size]
            for flat_index in twenty_percent_largest_ind
        ]

        inverted_noun_rows = utils.invert_dictionary(self.noun_rows)
        inverted_verb_columns = utils.invert_dictionary(self.verb_columns)

        # Pair each high value with the noun and verb it belongs to
        highest_values_list = []
        for i, j in real_aij_matrix_pos:
            highest_values_list.append(
                (self.matrix[i][j], inverted_noun_rows[i],
                 inverted_verb_columns[j]))

        return highest_values_list
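
The flat-to-2D index arithmetic above (// and % on the flat index) can also be expressed with np.unravel_index, which performs the same row-major conversion; a minimal sketch:

import numpy as np

n_rows, n_cols = 4, 5
flat_indices = [7, 12, 3]
rows, cols = np.unravel_index(flat_indices, (n_rows, n_cols))
pairs = [(int(r), int(c)) for r, c in zip(rows, cols)]
print(pairs)  # [(1, 2), (2, 2), (0, 3)]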
Example #4
avg_sim_matrix = noun_to_noun_sim_matrices['average_of_methods']

# avg_sim_matrix = np.zeros((42,42))
# i = 0
# for verb1 in matrix_columns_rows_index.keys():
#     j = 0
#     for verb2 in matrix_columns_rows_index.keys():
#         avg_sim_matrix[i][j] = model.wv.similarity(verb1, verb2)
#         j += 1
#     i += 1

workbook = xlsxUtils.MyExcelFileWrite('/home/paulojeunon/Desktop/', '42_verbs_similarity_wv.xlsx')
workbook.add_new_worksheet('verbs_sim')

inverted_dict = utils.invert_dictionary(matrix_columns_rows_index)

workbook.write_matrix_in_xlsx('verbs_sim', avg_sim_matrix, inverted_dict, inverted_dict)
workbook.close_workbook()


# Initialise a level-by-level matrix of zeros as a nested dict
matrix_cognitive_levels = {
    level: {level2: 0 for level2 in cts.names_of_cognitive_levels}
    for level in cts.names_of_cognitive_levels
}

for verb1 in blooms_verbs:
    verb1_level = utils.get_verb_cognitive_level(verb1, False)
    for verb2 in blooms_verbs:
        verb2_level = utils.get_verb_cognitive_level(verb2, False)
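
utils.get_verb_cognitive_level is not shown here. Given that Example #5 reads cts.cognitive_levels[cog_level + '_verbs'], a plausible reverse lookup is sketched below; this is an assumption, not the project's actual code, and the boolean second argument in the calls above is not reproduced because its meaning is not visible in these examples:

# Assumed sketch: find which Bloom level's verb list contains the verb.
# cognitive_levels is assumed to map e.g. 'knowledge_verbs' -> [verbs],
# matching the cts.cognitive_levels access in Example #5.
def get_verb_cognitive_level(verb, cognitive_levels):
    for level_name, verbs in cognitive_levels.items():
        if verb in verbs:
            return level_name[:-len('_verbs')]  # strip the '_verbs' suffix
    return None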
Example #5
def create_verb_co_occurrence_matrix():
    import nltk
    import heapq
    from nltk.corpus import wordnet as wn

    # stanford_server = utils.StanfordProcess(cts.home + 'systemScripts/runStanfordCoreNLP.sh')
    # stanford_server.start_process()

    lemmatizer = nltk.stem.WordNetLemmatizer()

    text_string_input = utils.read_text_input(
        cts.data['product_design_and_development']['path_to_input'] +
        'input.txt', 'utf-8', True)
    sentences = nltk.tokenize.sent_tokenize(text_string_input)

    regexp_tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')

    tokenized_sentences = [
        regexp_tokenizer.tokenize(sent) for sent in sentences
    ]
    # Keep only sentences of reasonable length (6 to 49 tokens)
    tokenized_sentences = [
        sent for sent in tokenized_sentences if 5 < len(sent) < 50
    ]

    tagger = nltk.tag.stanford.CoreNLPPOSTagger(url='http://localhost:9000')

    tagged_sentences = [tagger.tag(sent) for sent in tokenized_sentences]
    # Lemmatise every token tagged as a verb (POS tags starting with 'V')
    verbs_in_each_sentence = [[
        lemmatizer.lemmatize(word, 'v') for word, tag in sent
        if tag.startswith('V')
    ] for sent in tagged_sentences]

    v2i_row = {}
    i2v_row = {}

    v2i_column = {}
    i2v_column = {}

    index_row = 0
    index_column = 0
    for sent in verbs_in_each_sentence:
        # Skip sentences that contain none of the verbs of interest
        if not any(verb in cts.verbs_to_keep for verb in sent):
            continue

        # Keep only tokens that WordNet recognises as verbs
        filtered_sent = [verb for verb in sent if wn.synsets(verb, pos='v')]

        for verb in filtered_sent:
            if verb in cts.verbs_to_keep:
                if verb not in v2i_column:
                    v2i_column[verb] = index_column
                    i2v_column[index_column] = verb
                    index_column += 1
            else:
                if verb not in v2i_row:
                    v2i_row[verb] = index_row
                    i2v_row[index_row] = verb
                    index_row += 1

    # Re-index the kept verbs so the columns follow Bloom's cognitive levels
    v2i_column_temp = {}
    new_index = 0
    for cog_level in cts.names_of_cognitive_levels:
        print(cog_level)
        for verb in cts.cognitive_levels[cog_level + '_verbs']:
            if verb in v2i_column:
                v2i_column_temp[verb] = new_index
                new_index += 1

    v2i_column = v2i_column_temp
    i2v_column = utils.invert_dictionary(v2i_column)

    # One extra column is reserved for each row's total count ('row_sum')
    sim_matrix = np.zeros((len(v2i_row), len(v2i_column) + 1), dtype=float)

    # Count co-occurrences: rows are the remaining verbs, columns the kept ones
    for sent in verbs_in_each_sentence:
        if len(sent) > 1:
            for verb1 in sent:
                try:
                    i = v2i_row[verb1]
                except KeyError:
                    continue
                for verb2 in sent:
                    try:
                        j = v2i_column[verb2]
                        sim_matrix[i][j] += 1.0
                    except KeyError:
                        continue

    # Append an extra column index holding each row's total count
    v2i_column['row_sum'] = len(i2v_column)
    i2v_column[v2i_column['row_sum']] = 'row_sum'

    # Normalise each row to relative frequencies and store the raw sum
    for index in v2i_row.values():
        sum_value = np.sum(sim_matrix[index])
        if sum_value > 0:
            sim_matrix[index] = np.divide(sim_matrix[index], sum_value)
        sim_matrix[index][v2i_column['row_sum']] = sum_value

    # Order the rows by their total co-occurrence count, largest first
    sum_column_index = v2i_column['row_sum']
    sum_column = sim_matrix[:, sum_column_index]
    largest_row_indices = heapq.nlargest(len(sum_column),
                                         range(len(sum_column)),
                                         sum_column.take)
    ordered_row_verbs = [i2v_row[index] for index in largest_row_indices]

    sim_matrix_ordered = np.zeros((len(v2i_row), len(v2i_column)))

    for i in range(sim_matrix_ordered.shape[0]):
        sim_matrix_ordered[i] = sim_matrix[largest_row_indices[i]]

    v2i_row.clear()
    i2v_row.clear()
    for index, verb in enumerate(ordered_row_verbs):
        v2i_row[verb] = index
        i2v_row[index] = verb

    wb = xlsxUtils.MyExcelFileWrite(
        cts.home + '../', 'verb_co-occurrence_matrix_PDandD_42Xall.xlsx')
    wb.add_new_worksheet('matrix')
    wb.write_matrix_in_xlsx('matrix', sim_matrix_ordered, i2v_row, i2v_column)
    wb.close_workbook()

    # Distribute each verb's relative co-occurrence mass over the six levels
    cognitive_dist = {}
    for verb, i_index in v2i_row.items():
        # Skip verbs with fewer than 5 total co-occurrences
        if sim_matrix_ordered[i_index][v2i_column['row_sum']] < 5:
            continue
        cognitive_dist[verb] = {}
        for cog_level_name, verbs in cts.cognitive_levels.items():
            true_lvl_name = cog_level_name[:-6]  # strip the '_verbs' suffix
            for verb_42, j_index in v2i_column.items():
                if verb_42 in verbs:
                    cognitive_dist[verb][true_lvl_name] = (
                        cognitive_dist[verb].get(true_lvl_name, 0) +
                        sim_matrix_ordered[i_index][j_index])

    # Write a GDF graph file: one node per cognitive level and per verb,
    # plus weighted verb-to-level edges
    with open(cts.home + '../' + '42VerbXall_graph_PDandD.gdf', 'w') as gdf_out:
        gdf_out.write('nodedef>name VARCHAR\n')

        level_nodes = 'knowledge\ncomprehension\napplication\nanalysis\nsynthesis\nevaluation\n'
        gdf_out.write(level_nodes)

        for verb in cognitive_dist.keys():
            gdf_out.write(verb + '\n')

        gdf_out.write('edgedef>node1 VARCHAR, node2 VARCHAR, weight FLOAT\n')

        # Keep only edges whose weight is above the 0.1 threshold
        for verb, cog_level_dict in cognitive_dist.items():
            for lvl_name, value in cog_level_dict.items():
                if value > 0.1:
                    gdf_out.write(verb + ',' + lvl_name + ',' + str(value) + '\n')
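
For reference, the GDF file written by the last block has this shape (the verb node and the edge weights below are illustrative, not actual output):

nodedef>name VARCHAR
knowledge
comprehension
application
analysis
synthesis
evaluation
design
edgedef>node1 VARCHAR, node2 VARCHAR, weight FLOAT
design,application,0.35
design,synthesis,0.22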
Example #6
    def calculate_sim_matrix(self):

        print('calculate_sim_matrix started')

        # Five similarity matrices (wup, lch, jcn, lin and their average),
        # initialised to the 0.01 floor value
        for _ in range(5):
            self.noun_to_noun_sim_matrices.append(
                np.full((self.noun_rows_size, self.noun_rows_size),
                        0.01, dtype=float))

        inverted_noun_dict = utils.invert_dictionary(self.noun_rows)

        # Information-content statistics from the Brown corpus, used by the
        # jcn and lin similarity measures
        brown_ic = wordnet_ic.ic('ic-brown.dat')

        # Debug output: every row index and its noun
        for key in inverted_noun_dict:
            print(str(key) + ': ' + inverted_noun_dict[key])

        # Fill the upper triangle of each similarity matrix, pair by pair
        i = 0
        while i < (self.noun_rows_size - 1):
            j = i + 1
            w1 = wordnet.synsets(inverted_noun_dict[i], pos=wordnet.NOUN)
            if not w1:
                print('Not able to find this noun: ' + inverted_noun_dict[i])
                i += 1
                continue

            w1 = w1[0]

            while j < self.noun_rows_size:
                w2 = wordnet.synsets(inverted_noun_dict[j], pos=wordnet.NOUN)
                if not w2:
                    j += 1
                    continue

                w2 = w2[0]

                # Wu-Palmer similarity
                value = w1.wup_similarity(w2)
                value = utils.limit_value(value, 0.01, 1.0)
                self.noun_to_noun_sim_matrices[0][i][j] = value

                # Leacock-Chodorow similarity, scaled into [0, 1]
                value = w1.lch_similarity(w2) / lch_maximum_obtained_value
                value = utils.limit_value(value, 0.01, 1.0)
                self.noun_to_noun_sim_matrices[1][i][j] = value

                # Jiang-Conrath similarity (information-content based)
                value = w1.jcn_similarity(w2, brown_ic)
                value = utils.limit_value(value, 0.01, 1.0, True)
                self.noun_to_noun_sim_matrices[2][i][j] = value

                # Lin similarity (information-content based)
                value = w1.lin_similarity(w2, brown_ic)
                value = utils.limit_value(value, 0.01, 1.0)
                self.noun_to_noun_sim_matrices[3][i][j] = value

                # Average of the four measures
                value = (self.noun_to_noun_sim_matrices[0][i][j] +
                         self.noun_to_noun_sim_matrices[1][i][j] +
                         self.noun_to_noun_sim_matrices[2][i][j] +
                         self.noun_to_noun_sim_matrices[3][i][j]) / 4.0

                value = utils.limit_value(value, 0.01, 1.0)

                self.noun_to_noun_sim_matrices[4][i][j] = value

                j += 1

            print('sim_matrix: ' + str(i) + '\n')
            i += 1

        print('calculate_sim_matrix ended')
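
utils.limit_value is not shown in these examples. From its call sites it appears to clamp a similarity score into [minimum, maximum]; a minimal sketch under that assumption (the optional fourth argument passed for jcn_similarity above is not reproduced, since its meaning is not visible here):

# Assumed sketch: clamp a value into [minimum, maximum]. WordNet similarity
# methods can return None, which is treated as the minimum here.
def limit_value(value, minimum, maximum):
    if value is None:
        return minimum
    return max(minimum, min(maximum, value))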