def save_matrix_in_xlsx(cooc_matrix, pure_matrix, path_to_output_xlsx, workbook_name):
    # The worksheets are saved in an Excel workbook whose location and file name are given
    # by path_to_output_xlsx and workbook_name
    with MyExcelFileWrite(path_to_output_xlsx, workbook_name) as workbook:
        # A test worksheet holding the raw frequency counts of the verb-noun pairs
        workbook.add_new_worksheet('pure_frequency_count')
        workbook.write_matrix_in_xlsx('pure_frequency_count', pure_matrix,
                                      utils.invert_dictionary(cooc_matrix.noun_rows),
                                      utils.invert_dictionary(cooc_matrix.verb_columns))

        # In the lines below the remaining worksheets are created and associated with the workbook
        workbook.add_new_worksheet('cooc_matrix_full')
        workbook.add_new_worksheet('verb_filtered_arrays')
        # worksheet2 = get_new_worksheet('soc_pmi_matrix', workbook)

        inverted_matrix_noun_rows = utils.invert_dictionary(cooc_matrix.noun_rows)

        workbook.write_matrix_in_xlsx('cooc_matrix_full', cooc_matrix.matrix,
                                      inverted_matrix_noun_rows,
                                      utils.invert_dictionary(cooc_matrix.verb_columns))

        ordered_verbs = utils.sort_dict(cooc_matrix.verb_filtered_arrays)
        ordered_verbs = [ordered_verbs[l][0] for l in range(len(ordered_verbs))]

        workbook.write_verb_filtered_arrays('verb_filtered_arrays',
                                            cooc_matrix.verb_filtered_arrays,
                                            cooc_matrix.nouns_from_verb_arrays,
                                            ordered_verbs)
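# Illustrative usage sketch for save_matrix_in_xlsx (not part of the pipeline above). It assumes
# an already-built CoocMatrix-like object exposing noun_rows, verb_columns, matrix,
# verb_filtered_arrays and nouns_from_verb_arrays; the output directory and file name are
# hypothetical placeholders.
def _example_save_matrix_usage(cooc_matrix):
    # The raw frequency counts are passed separately from the CoocMatrix object; here the
    # object's own matrix stands in for them purely for illustration.
    pure_matrix = cooc_matrix.matrix
    save_matrix_in_xlsx(cooc_matrix, pure_matrix,
                        '/tmp/output/',               # hypothetical output directory
                        'cooc_matrix_example.xlsx')   # hypothetical workbook name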
def filter_coocmatrix2(self):
    print('filter_coocmatrix2 started')
    verb_keys = self.verb_columns.keys()
    num_of_rows = self.matrix.shape[0]
    thirty_percent = int(np.ceil(0.3 * num_of_rows))
    inverted_noun_rows_dict = utils.invert_dictionary(self.noun_rows)

    for verb in verb_keys:
        verb_index = self.verb_columns[verb]
        temp_column = self.matrix[:, verb_index]

        thirty_percent_largest_indices = heapq.nlargest(thirty_percent, range(num_of_rows),
                                                        temp_column.take)

        most_co_occurring_nouns_values = list(temp_column[thirty_percent_largest_indices])

        self.verb_filtered_arrays[verb] = most_co_occurring_nouns_values
        self.nouns_from_verb_arrays[verb] = [inverted_noun_rows_dict[index]
                                             for index in thirty_percent_largest_indices]

    print('filter_coocmatrix2 ended')
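# Minimal, self-contained sketch of the selection step used in filter_coocmatrix2:
# heapq.nlargest with ndarray.take as the key returns the *indices* of the largest values
# in a column. The numbers below are made up for illustration only.
def _example_nlargest_column_indices():
    import heapq
    import numpy as np

    column = np.array([2.0, 9.0, 4.0, 7.0, 1.0])
    top_k = int(np.ceil(0.3 * len(column)))  # 30% of 5 rows -> 2
    largest_indices = heapq.nlargest(top_k, range(len(column)), column.take)
    # largest_indices == [1, 3]; column[largest_indices] gives the corresponding values
    return largest_indices, list(column[largest_indices])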
def get_20_percent_of_highest_pairs(self):
    # Gets the number that corresponds to 20% of the total of elements in the matrix
    twenty_percent = int(np.ceil(self.noun_rows_size * self.verb_columns_size * 0.2))

    # Creates an empty numpy array with the size of the entire matrix
    temp_matrix_list = np.empty(self.noun_rows_size * self.verb_columns_size)

    # Flatten the matrix into the 1D array
    x = 0
    for i in self.matrix.flat:
        temp_matrix_list[x] = i
        x += 1

    # Get the indices of the 20% most occurring items in the flattened ("1D") matrix
    twenty_percent_largest_ind = heapq.nlargest(
        twenty_percent, range(self.noun_rows_size * self.verb_columns_size),
        temp_matrix_list.take)

    # Transform the obtained 1D indices into the indices of the real matrix (2D indices)
    real_aij_matrix_pos = []
    i = 0
    while i < twenty_percent:
        real_aij_matrix_pos.append([twenty_percent_largest_ind[i] // self.verb_columns_size,
                                    twenty_percent_largest_ind[i] % self.verb_columns_size])
        i += 1

    inverted_noun_rows = utils.invert_dictionary(self.noun_rows)
    inverted_verb_columns = utils.invert_dictionary(self.verb_columns)

    highest_values_list = []
    for i, j in real_aij_matrix_pos:
        highest_values_list.append((self.matrix[i][j], inverted_noun_rows[i],
                                    inverted_verb_columns[j]))

    return highest_values_list
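# Small sketch of the index conversion used in get_20_percent_of_highest_pairs: a flat (1D)
# index k maps back to the 2D position (k // n_columns, k % n_columns), which is also what
# np.unravel_index computes. The shape and index below are illustrative only.
def _example_flat_to_2d_index():
    import numpy as np

    n_rows, n_columns = 3, 4
    flat_index = 7
    row, col = flat_index // n_columns, flat_index % n_columns  # -> (1, 3)
    assert (row, col) == np.unravel_index(flat_index, (n_rows, n_columns))
    return row, col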
avg_sim_matrix = noun_to_noun_sim_matrices['average_of_methods']

# avg_sim_matrix = np.zeros((42, 42))
# i = 0
# for verb1 in matrix_columns_rows_index.keys():
#     j = 0
#     for verb2 in matrix_columns_rows_index.keys():
#         avg_sim_matrix[i][j] = model.wv.similarity(verb1, verb2)
#         j += 1
#     i += 1

workbook = xlsxUtils.MyExcelFileWrite('/home/paulojeunon/Desktop/', '42_verbs_similarity_wv.xlsx')
workbook.add_new_worksheet('verbs_sim')

inverted_dict = utils.invert_dictionary(matrix_columns_rows_index)
workbook.write_matrix_in_xlsx('verbs_sim', avg_sim_matrix, inverted_dict, inverted_dict)
workbook.close_workbook()

matrix_cognitive_levels = {}
for level in cts.names_of_cognitive_levels:
    matrix_cognitive_levels[level] = {}
    for level2 in cts.names_of_cognitive_levels:
        matrix_cognitive_levels[level][level2] = 0

for verb1 in blooms_verbs:
    for verb2 in blooms_verbs:
        verb1_level = utils.get_verb_cognitive_level(verb1, False)
        verb2_level = utils.get_verb_cognitive_level(verb2, False)
def create_verb_co_occurrence_matrix():
    import nltk
    import heapq
    from nltk.corpus import wordnet as wn

    # stanford_server = utils.StanfordProcess(cts.home + 'systemScripts/runStanfordCoreNLP.sh')
    # stanford_server.start_process()

    lemmatizer = nltk.stem.WordNetLemmatizer()

    text_string_input = utils.read_text_input(
        cts.data['product_design_and_development']['path_to_input'] + 'input.txt', 'utf-8', True)

    sentences = nltk.tokenize.sent_tokenize(text_string_input)
    regexp_tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    tokenized_sentences = [regexp_tokenizer.tokenize(sent) for sent in sentences]
    tokenized_sentences = [sent for sent in tokenized_sentences if 5 < len(sent) < 50]

    tagger = nltk.tag.stanford.CoreNLPPOSTagger(url='http://localhost:9000')
    tagged_sentences = [tagger.tag(sent) for sent in tokenized_sentences]

    # Keep only the (lemmatized) verbs of each sentence
    verbs_in_each_sentence = [[lemmatizer.lemmatize(word, 'v') for word, tag in sent
                               if tag.startswith('V')] for sent in tagged_sentences]

    v2i_row = {}
    i2v_row = {}
    v2i_column = {}
    i2v_column = {}
    index_row = 0
    index_column = 0

    for sent in verbs_in_each_sentence:
        # Skip sentences that contain none of the verbs of interest
        skip_loop = True
        for verb in sent:
            if verb in cts.verbs_to_keep:
                skip_loop = False
                break
        if skip_loop:
            continue

        # Keep only verbs that WordNet recognizes as verbs
        filtered_sent = []
        for verb in sent:
            synset_list = wn.synsets(verb, pos='v')
            if synset_list:
                filtered_sent.append(verb)

        for verb in filtered_sent:
            if verb in cts.verbs_to_keep:
                if verb not in v2i_column:
                    v2i_column[verb] = index_column
                    i2v_column[index_column] = verb
                    index_column += 1
            else:
                if verb not in v2i_row:
                    v2i_row[verb] = index_row
                    i2v_row[index_row] = verb
                    index_row += 1

    # Reorder the columns so that the kept verbs appear grouped by cognitive level
    v2i_column_temp = {}
    new_index = 0
    for cog_level in cts.names_of_cognitive_levels:
        print(cog_level)
        for verb in cts.cognitive_levels[cog_level + '_verbs']:
            if verb in v2i_column:
                v2i_column_temp[verb] = new_index
                new_index += 1

    v2i_column = v2i_column_temp
    i2v_column = utils.invert_dictionary(v2i_column)

    # Count co-occurrences of row verbs with column verbs in the same sentence
    sim_matrix = np.zeros((len(v2i_row), len(v2i_column) + 1), dtype=float)
    for sent in verbs_in_each_sentence:
        if len(sent) > 1:
            for verb1 in sent:
                try:
                    i = v2i_row[verb1]
                except KeyError:
                    continue
                for verb2 in sent:
                    try:
                        j = v2i_column[verb2]
                        sim_matrix[i][j] += 1.0
                    except KeyError:
                        continue

    # Normalize each row and keep the original row total in an extra 'row_sum' column
    v2i_column['row_sum'] = len(i2v_column)
    i2v_column[v2i_column['row_sum']] = 'row_sum'
    for index in v2i_row.values():
        sum_value = np.sum(sim_matrix[index])
        sim_matrix[index] = np.divide(sim_matrix[index], sum_value)
        sim_matrix[index][v2i_column['row_sum']] = sum_value

    # Order the rows by descending row sum
    sum_column_index = v2i_column['row_sum']
    sum_column = sim_matrix[:, sum_column_index]
    largest_row_indices = heapq.nlargest(len(sum_column), range(len(sum_column)), sum_column.take)
    ordered_row_verbs = [i2v_row[index] for index in largest_row_indices]

    sim_matrix_ordered = np.zeros((len(v2i_row), len(v2i_column)))
    for i in range(sim_matrix_ordered.shape[0]):
        sim_matrix_ordered[i] = sim_matrix[largest_row_indices[i]]

    v2i_row.clear()
    i2v_row.clear()
    for index, verb in enumerate(ordered_row_verbs):
        v2i_row[verb] = index
        i2v_row[index] = verb

    wb = xlsxUtils.MyExcelFileWrite(cts.home + '../', 'verb_co-occurrence_matrix_PDandD_42Xall.xlsx')
    wb.add_new_worksheet('matrix')
    wb.write_matrix_in_xlsx('matrix', sim_matrix_ordered, i2v_row, i2v_column)
    wb.close_workbook()

    # Accumulate, per row verb, the distribution of its co-occurrences over the cognitive levels
    cognitive_dist = {}
    for verb, i_index in v2i_row.items():
        # Ignore verbs whose total co-occurrence count is below 5
        if sim_matrix_ordered[i_index][v2i_column['row_sum']] < 5:
            continue
        cognitive_dist[verb] = {}
        for cog_level_name, verbs in cts.cognitive_levels.items():
            true_lvl_name = cog_level_name[:-6]  # strip the '_verbs' suffix
            for verb_42, j_index in v2i_column.items():
                if verb_42 in verbs:
                    if true_lvl_name in cognitive_dist[verb]:
                        cognitive_dist[verb][true_lvl_name] += sim_matrix_ordered[i_index][j_index]
                    else:
                        cognitive_dist[verb][true_lvl_name] = sim_matrix_ordered[i_index][j_index]

    # Write a GDF graph file linking each verb to the cognitive levels it co-occurs with
    gdf_out = open(cts.home + '../' + '42VerbXall_graph_PDandD.gdf', 'w')
    gdf_out.write('nodedef>name VARCHAR\n')
    level_nodes = 'knowledge\ncomprehension\napplication\nanalysis\nsynthesis\nevaluation\n'
    gdf_out.write(level_nodes)
    for verb in cognitive_dist.keys():
        gdf_out.write(verb + '\n')

    gdf_out.write('edgedef>node1 VARCHAR, node2 VARCHAR, weight FLOAT\n')
    for verb, cog_level_dict in cognitive_dist.items():
        for lvl_name, value in cog_level_dict.items():
            if value > 0.1:
                gdf_out.write(verb + ',' + lvl_name + ',' + str(value) + '\n')

    gdf_out.close()
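# For reference, the GDF file produced above has the shape sketched below: a node section
# listing the six cognitive levels followed by the retained verbs, then a weighted edge
# section. The verb name and weight in the example edge line are invented for illustration.
#
#   nodedef>name VARCHAR
#   knowledge
#   comprehension
#   application
#   analysis
#   synthesis
#   evaluation
#   design
#   edgedef>node1 VARCHAR, node2 VARCHAR, weight FLOAT
#   design,application,0.35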
def calculate_sim_matrix(self):
    print('calculate_sim_matrix started')

    # One matrix per similarity measure (wup, lch, jcn, lin) plus one for the average of the
    # methods; every cell starts at 0.01, the same lower bound used by utils.limit_value below.
    for _ in range(5):
        self.noun_to_noun_sim_matrices.append(
            np.add(np.zeros((self.noun_rows_size, self.noun_rows_size), dtype=float), 0.01))

    inverted_noun_dict = utils.invert_dictionary(self.noun_rows)
    brown_ic = wordnet_ic.ic('ic-brown.dat')

    for key in inverted_noun_dict:
        print(str(key) + ': ' + inverted_noun_dict[key])

    i = 0
    while i < (self.noun_rows_size - 1):
        j = i + 1

        w1 = wordnet.synsets(inverted_noun_dict[i], pos=wordnet.NOUN)
        if not w1:
            print('Not able to find this noun: ' + inverted_noun_dict[i])
            i += 1
            continue
        w1 = w1[0]

        while j < self.noun_rows_size:
            w2 = wordnet.synsets(inverted_noun_dict[j], pos=wordnet.NOUN)
            if not w2:
                j += 1
                continue
            w2 = w2[0]

            value = w1.wup_similarity(w2)
            value = utils.limit_value(value, 0.01, 1.0)
            self.noun_to_noun_sim_matrices[0][i][j] = value

            value = w1.lch_similarity(w2) / lch_maximum_obtained_value
            value = utils.limit_value(value, 0.01, 1.0)
            self.noun_to_noun_sim_matrices[1][i][j] = value

            value = w1.jcn_similarity(w2, brown_ic)
            value = utils.limit_value(value, 0.01, 1.0, True)
            self.noun_to_noun_sim_matrices[2][i][j] = value

            value = w1.lin_similarity(w2, brown_ic)
            value = utils.limit_value(value, 0.01, 1.0)
            self.noun_to_noun_sim_matrices[3][i][j] = value

            # Average of the four methods above
            value = (self.noun_to_noun_sim_matrices[0][i][j] +
                     self.noun_to_noun_sim_matrices[1][i][j] +
                     self.noun_to_noun_sim_matrices[2][i][j] +
                     self.noun_to_noun_sim_matrices[3][i][j]) / 4.0
            value = utils.limit_value(value, 0.01, 1.0)
            self.noun_to_noun_sim_matrices[4][i][j] = value

            j += 1

        print('sim_matrix: ' + str(i) + '\n')
        i += 1

    print('calculate_sim_matrix ended')
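# Stand-alone sketch of the WordNet similarity calls used in calculate_sim_matrix, for two
# example nouns. It assumes the NLTK WordNet corpus and the 'ic-brown.dat' information-content
# file are already available locally; the nouns are arbitrary placeholders.
def _example_pairwise_noun_similarity(noun_a='car', noun_b='truck'):
    from nltk.corpus import wordnet, wordnet_ic

    brown_ic = wordnet_ic.ic('ic-brown.dat')
    w1 = wordnet.synsets(noun_a, pos=wordnet.NOUN)[0]
    w2 = wordnet.synsets(noun_b, pos=wordnet.NOUN)[0]

    return {
        'wup': w1.wup_similarity(w2),
        'lch': w1.lch_similarity(w2),  # divided by lch_maximum_obtained_value in the method above
        'jcn': w1.jcn_similarity(w2, brown_ic),
        'lin': w1.lin_similarity(w2, brown_ic),
    }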