def main():
    """Run the complete reporting pipeline for the last quarter.

    Creates the output folder of the last quarter if necessary and then runs
    the pipeline modules in a fixed order: data retrieval and cleaning, tag
    prediction, citation deltas, author network and word clouds.

    The citation step needs a ``PoPCites.csv`` that has to be added by hand;
    the function keeps prompting on stdin until that file exists in the
    folder of the last quarter.
    """
    # create folder if it does not exist yet
    path_lastqtr = func.get_abspath_folder_lastquarter()
    print(path_lastqtr)
    # exist_ok avoids the race of a separate exists() check before creating
    os.makedirs(path_lastqtr, exist_ok=True)

    # execute relative modules
    # get and clean data
    search_string.main()
    search_string_im.main()
    get_KITopen.main()
    get_KITopen_im.main()
    clean_report.main()

    # predict tag
    predict_tag.main()

    # calculate citation deltas; PoPCites.csv has to be provided manually,
    # so wait (re-checking after every key press) until it is in place
    while not os.path.exists(
            func.get_abspath_folder_lastquarter() + "PoPCites.csv"):
        input(
            "Please add PoPCites.csv of {} under the path: {} and then enter any values to continue"
            .format(func.get_last_2_quarters(),
                    func.get_abspath_folder_lastquarter()))
    calc_cites_delta.main()

    # get author relationship and network
    get_data_network.main()
    get_network.main()

    # get wordcloud
    get_wordcloud.main()
def main():
    """Create cleaned copies of the ISSD and IM reports from KIT Open.

    Both reports are passed through ``clean_report`` and the results are
    written next to the originals in the folder of the last quarter as
    ``report_<quarters>_clean.xlsx`` and ``report_im_<quarters>_clean.xlsx``.
    """
    print('Cleaning Excel file from KIT Open...')

    folder_lastqtr = func.get_abspath_folder_lastquarter()
    # the cleaned files use the quarter string without a connector
    quarters_compact = func.get_last_2_quarters(connector='')

    # clean report of ISSD
    path_report_issd = func.get_abspath_report()
    path_report_clean_issd = folder_lastqtr + 'report_{}_clean.xlsx'.format(
        quarters_compact)
    clean_report(path_report_issd, path_report_clean_issd)
    print('Successfully created cleaned report of ISSD under path: {}'.format(
        path_report_clean_issd))

    # clean report of IISM
    path_report_iism = folder_lastqtr + 'report_im_{}.xlsx'.format(
        func.get_last_2_quarters())
    path_report_clean_iism = folder_lastqtr + 'report_im_{}_clean.xlsx'.format(
        quarters_compact)
    clean_report(path_report_iism, path_report_clean_iism)
    # report the IM output path here (the ISSD path was printed by mistake)
    print('Successfully created cleaned report of IM under path: {}'.format(
        path_report_clean_iism))
def main():
    """Download the IM publication report from KIT Open as an Excel file.

    Reads the author search string of the last two quarters from the
    ``researchers list`` folder below the current working directory, turns it
    into the ``authors`` URL parameter via ``func.replace_name_url`` and
    stores the response body as ``report_im_<quarters>.xlsx`` in the folder
    of the last quarter.

    Raises:
        requests.HTTPError: If KIT Open answers with an HTTP error status.
    """
    print("Getting data of IM from KIT-Open...")

    # os.path.join instead of a hand-written '\\...\s...' literal: no invalid
    # escape sequence and the same path on Windows
    path_search_str = os.path.join(
        os.path.abspath(''), 'researchers list',
        'search string_im {}.txt'.format(
            func.get_last_2_quarters(connector='-')))
    with open(path_search_str, 'r', encoding='utf-8') as f:
        search_str = f.read()
    url_author_str = func.replace_name_url(search_str)

    url = ('https://publikationen.bibliothek.kit.edu/auswertungen/report.php?external_publications=all&open_access_availability=do_not_care&full_text=do_not_care&key_figures=number_of_publications&year=2010-&consider_online_advance_publication_date=true&consider_additional_pof_structures=false&row=type&column=year&authors='
           + url_author_str
           + '&in_opac=false&format=excel&publications=true')
    r = requests.get(url, allow_redirects=True)
    # fail loudly instead of saving an HTML error page as an .xlsx file
    r.raise_for_status()

    path_report = func.get_abspath_folder_lastquarter(
    ) + 'report_im_{}.xlsx'.format(func.get_last_2_quarters())
    with open(path_report, 'wb') as f:
        f.write(r.content)
    print(
        "Successfully saved data of IM into Excel file under path: {}".format(
            path_report))
def main():
    """Build the publication-by-researcher matrix used for the author network.

    Every row stands for one publication of the cleaned report, every column
    for one researcher of the name list. A cell is 1 when the researcher's
    name ("Last name, First name") occurs in the publication's author string
    and 0 otherwise. The matrix is written to
    ``data/<quarters>/data_network_<quarters>.xlsx`` below the current working
    directory.
    """
    print("Getting data of authors...")

    # get namelist of authors
    path_namelist = func.get_abspath_researcher()
    namelist = pd.read_excel(path_namelist)
    # format: "<Last name>, <First name>"
    fullname_list = namelist['Last name'] + ', ' + namelist['First name']

    # read report and get author list
    path_report = func.get_abspath_folder_lastquarter() + \
        'report_{}_clean.xlsx'.format(func.get_last_2_quarters(connector=''))
    df_report = pd.read_excel(path_report)
    df_author_eng = func.replace_ger_char(df_report['Autor'])

    # get matrix; the builtin int replaces np.int, which NumPy >= 1.24 no
    # longer provides
    x = np.zeros(shape=(len(df_author_eng), len(fullname_list)),
                 dtype=int,
                 order='C')
    for i, authors in enumerate(df_author_eng):
        for h, fullname in enumerate(fullname_list):
            if fullname in authors:
                x[i, h] = 1

    # merge result and generate .xlsx
    df_result = pd.DataFrame(data=x, columns=fullname_list)
    # os.path.join instead of a '\data\{}\...' literal with invalid escapes;
    # on Windows the resulting path is unchanged
    path_data_network = os.path.join(
        os.path.abspath(''), 'data',
        func.get_last_2_quarters(connector='-'),
        'data_network_{}.xlsx'.format(func.get_last_2_quarters()))
    df_result.to_excel(path_data_network, index=False)
    print("Successfully saved author data under path: {}".format(
        path_data_network))
def main():
    """Generate word clouds from the paper titles of the ISSD and IM reports.

    Title lists are written as text files into the ``report title`` sub-folder
    of the last quarter's folder and then handed to ``get_wordcloud``:

    * all titles of the cleaned ISSD report      -> ``wordcloud_issd.png``
    * ISSD titles of each of the last four years -> ``wordcloud_issd_<year>.png``
    * all titles of the IM report                -> ``wordcloud_im.png``
    """
    print("Generating wordclouds of paper titles...")

    folder_lastqtr = func.get_abspath_folder_lastquarter()
    quarters = func.get_last_2_quarters()

    # make sure the folder for the title files exists
    path_report_title = folder_lastqtr + 'report title'
    os.makedirs(path_report_title, exist_ok=True)

    # get report and generate wordcloud of issd report
    # The cleaned report is named with connector='' by the cleaning step, so
    # it is read with the same connector here.
    path_report_issd = folder_lastqtr + 'report_{}_clean.xlsx'.format(
        func.get_last_2_quarters(connector=''))
    df_report_issd = pd.read_excel(path_report_issd)
    path_title_issd = os.path.join(
        path_report_title, 'report_title_issd_{}.txt'.format(quarters))
    df_report_issd['Titel'].to_csv(path_title_issd, index=False)
    get_wordcloud(path_title_issd, folder_lastqtr + 'wordcloud_issd.png')

    # one wordcloud per year for the last four completed years
    # NOTE(review): older years are not pooled into an aggregate cloud; the
    # former branch for that could never run with four iterations. Extend
    # the loop if such a cloud is wanted.
    lastyr = datetime.datetime.now().year - 1
    for year in range(lastyr, lastyr - 4, -1):
        df_title_year = df_report_issd.loc[
            df_report_issd['Erscheinungsjahr'] == year, ['Titel']]
        path_title_year = os.path.join(
            path_report_title, 'report_title_issd_{}.txt'.format(year))
        df_title_year.to_csv(path_title_year, index=False)
        path_wordcloud_year = folder_lastqtr + 'wordcloud_issd_{}.png'.format(
            year)
        get_wordcloud(path_title_year,
                      path_wordcloud_year,
                      width_value=1000,
                      height_value=600)

    # get report and generate wordcloud of the IM report
    path_report_im = folder_lastqtr + 'report_im_{}.xlsx'.format(quarters)
    df_report_im = pd.read_excel(path_report_im, sheet_name='Publikationen')
    path_title_im = os.path.join(
        path_report_title, 'report_title_im_{}.txt'.format(quarters))
    df_report_im['Titel'].to_csv(path_title_im, index=False)
    get_wordcloud(path_title_im, folder_lastqtr + 'wordcloud_im.png')

    print("Successfully generated wordclous of paper titles")
def main():
    """Predict the three most likely KIT tags for publications without a tag.

    Rows of the cleaned report whose ``KIT-Tagging`` cell is empty are
    encoded as a 0/1 matrix (one column per entry of the ML input list, 1 if
    that entry occurs in the row's ``Quelle`` text), scored with the stored
    model, and the three best-scoring tags per row are saved together with
    the source text as ``tag_prediction_<quarters>.xlsx`` in the folder of
    the last quarter. If no tag is missing, only a message is printed.
    """
    print("Predicting blank KIT-tags in the report...")
    path_report_clean = func.get_abspath_folder_lastquarter(
    ) + 'report_{}_clean.xlsx'.format(func.get_last_2_quarters(connector=''))
    df_report = pd.read_excel(path_report_clean)
    df_blank = df_report.loc[df_report['KIT-Tagging'].isnull(),
                             ['Publikationstyp', 'Autor', 'Quelle']]
    # a fresh 0..n-1 index keeps the rows aligned with the prediction rows
    df_blank = df_blank.reset_index(drop=True)

    if df_blank.empty:
        print("There aren't any blank KIT-tags in the report")
        return

    quelle = df_blank['Quelle']

    path_tag = 'MasterData/master_data_ranking_2020.xlsx'
    kit_tag = pd.read_excel(path_tag, sheet_name='Publications')['kit_tag']

    path_input = 'ML-model training/ML input.xlsx'
    # change the sheet name to switch the version of the input list
    df_input = pd.read_excel(path_input, sheet_name='top125')['Input']

    # feature matrix: one row per source, one column per input entry
    # (builtin int replaces np.int, which NumPy >= 1.24 no longer provides)
    x = np.zeros(shape=(len(quelle), len(df_input)), dtype=int, order='C')
    for i, source in enumerate(quelle):
        for h, keyword in enumerate(df_input):
            if keyword in source:
                x[i, h] = 1

    # ML prediction
    path_model = 'ML-model training/tag_model.h5'
    model = load_model(path_model)
    x_t = model.predict(x)

    # collect the rows first and build the frame once; DataFrame.append is
    # not available in pandas >= 2.0
    top_k = 3
    top3_rows = []
    for scores in x_t:
        # indices of the top_k highest scores, best first
        top_k_idx = scores.argsort()[-top_k:][::-1]
        top3_rows.append([kit_tag[idx] for idx in top_k_idx])
    df_top3 = pd.DataFrame(top3_rows, columns=['No.1', 'No.2', 'No.3'])

    # save result
    path_predict = func.get_abspath_folder_lastquarter(
    ) + 'tag_prediction_{}.xlsx'.format(func.get_last_2_quarters())
    df_top3 = pd.concat([df_blank['Quelle'], df_top3], axis=1)
    df_top3.to_excel(path_predict, index=False)
    print(df_top3)
    print("Successfully saved predictions under path: {}".format(
        path_predict))
def main():
    """Draw the co-authorship network of the researchers and save it as HTML.

    Reads the 0/1 publication-by-researcher matrix of the last quarter,
    counts publications per researcher (nodes) and joint publications per
    pair of researchers (edges), lays the graph out with a spring layout and
    writes the plotly figure to ``network_author.html`` in the folder of the
    last quarter.
    """
    print("Generating cooperation network of authors...")
    network_file = func.get_abspath_folder_lastquarter(
    ) + 'data_network_{}.xlsx'.format(func.get_last_2_quarters())
    publication_matrix = pd.read_excel(network_file)

    # researcher names in the form "<Last name>, <First name>"; the first
    # entry of the list is left out of the network
    researchers = pd.read_excel(func.get_abspath_researcher())
    fullname_list = researchers['Last name'] + ', ' + researchers['First name']
    fullname_list = fullname_list.drop([0])

    # per author: number of own papers and number of papers shared with
    # every other author
    paper_counts = {}
    coauthor_counts = {}
    for author in fullname_list:
        own_papers = publication_matrix[publication_matrix[author] == 1]
        paper_counts[author] = own_papers[author].sum()
        coauthor_counts[author] = {
            other: own_papers[other].sum()
            for other in fullname_list
            if other != author
        }

    # network: only authors with papers and pairs with joint papers
    nw_author = nx.Graph()
    for author, n_papers in paper_counts.items():
        if n_papers > 0:
            nw_author.add_node(author, size=n_papers)
    for author, partners in coauthor_counts.items():
        for partner, n_joint in partners.items():
            if n_joint > 0:
                nw_author.add_edge(author, partner, weight=n_joint)

    # get positions
    pos_ = nx.spring_layout(nw_author)

    # make edge traces: every edge is drawn once per direction (all forward
    # traces first, then all reversed ones), labelled with the author at the
    # end point
    edge_trace = []
    for reverse in (False, True):
        for first, second, weight in nw_author.edges(data='weight'):
            if weight > 0:
                start, end = (second, first) if reverse else (first, second)
                x0, y0 = pos_[start]
                x1, y1 = pos_[end]
                label = end + ': ' + str(weight)
                edge_trace.append(
                    make_edge([x0, x1, None], [y0, y1, None],
                              label,
                              width=weight**0.5))

    # make node trace; marker sizes are left empty, only position, colour
    # and label are filled in per node
    node_trace = go.Scatter(x=[],
                            y=[],
                            text=[],
                            textposition="top center",
                            textfont_size=10,
                            mode='markers+text',
                            hoverinfo='none',
                            marker=dict(color=[], size=[], line=None))
    for node in nw_author.nodes():
        node_x, node_y = pos_[node]
        node_trace['x'] += (node_x,)
        node_trace['y'] += (node_y,)
        node_trace['marker']['color'] += (color_nw,)
        node_trace['text'] += ('<b>' + node + '</b>',)

    # customize layout: transparent background, no legend, no tick labels
    fig = go.Figure(layout=go.Layout(paper_bgcolor='rgba(0,0,0,0)',
                                     plot_bgcolor='rgba(0,0,0,0)',
                                     hovermode='x'))
    for trace in edge_trace:
        fig.add_trace(trace)
    fig.add_trace(node_trace)
    fig.update_layout(showlegend=False)
    fig.update_xaxes(showticklabels=False)
    fig.update_yaxes(showticklabels=False)

    path_html = func.get_abspath_folder_lastquarter() + "network_author.html"
    fig.write_html(path_html)
    print("Successfully generated cooperation network under path: {}".format(
        path_html))