def main():
    """Compute total citations from PoPCites.csv and record the quarterly delta.

    Reads the Publish-or-Perish export, de-duplicates titles, sums citations,
    writes a cleaned CSV, then appends a (quarter, total, delta) row to
    citation_delta.csv — carrying forward the previous half-year's file.
    """
    print("Calculating relative data of PoPCites.csv...")
    last2quarters_hyphen = func.get_last_2_quarters(connector='-')
    path_pop = func.get_abspath_pop()

    # ---- total citations -------------------------------------------------
    df_pop = pd.read_csv(path_pop)
    # filter condition: keep only publications with Year > 2009
    popcites = df_pop.loc[df_pop['Year'] > 2009, ['Year', 'Authors', 'Title', 'Cites']]
    popcites['Title'] = popcites['Title'].str.lower()  # case-insensitive dedup key
    popcites['Year'] = popcites['Year'].astype(int)
    popcites_remove_duplicate = popcites.drop_duplicates(subset='Title')
    total_cites = popcites_remove_duplicate['Cites'].sum()
    # NOTE(review): Windows-only path separators; presumably run on Windows — confirm
    path_pop_new = os.path.abspath('') + '\data\{}\PoPCites clean.csv'.format(last2quarters_hyphen)
    popcites_remove_duplicate.to_csv(path_pop_new, index=False, encoding='utf_8_sig')

    # ---- citation delta vs. previous half-year ---------------------------
    halfyear_before_today = datetime.date.today() - datetime.timedelta(180)
    path_delta_pre = os.path.abspath('') + '\data\{}\citation_delta.csv'.format(
        func.get_last_2_quarters(date=halfyear_before_today, connector='-'))
    path_delta = os.path.abspath('') + '\data\{}\citation_delta.csv'.format(last2quarters_hyphen)
    df_delta = pd.read_csv(path_delta_pre, encoding='utf-8-sig')
    df_delta.replace(np.nan, 0, inplace=True)
    df_delta.replace(np.inf, 0, inplace=True)
    querydate_quarter = func.get_last_quarter(connector='_')
    querydate_quarter_pre = func.get_last_quarter(halfyear_before_today, connector='_')
    # total of the previous quarter; raises IndexError if that quarter is missing
    total_cites_pre = df_delta[df_delta['QueryDateQuarter'] == querydate_quarter_pre].iloc[0]['Cites']
    delta_cites = total_cites - int(total_cites_pre)
    df_newrow = pd.DataFrame(
        data=[[querydate_quarter, total_cites, delta_cites]],
        columns=['QueryDateQuarter', 'Cites', 'delta'])
    # Only append when this quarter is not yet recorded (idempotent re-runs).
    # DataFrame.append and Series.bool() were removed in pandas 2.x, so use a
    # plain membership test plus pd.concat.
    if querydate_quarter not in df_delta['QueryDateQuarter'].values:
        df_delta = pd.concat([df_delta, df_newrow], ignore_index=True)
    df_delta.to_csv(path_delta, index=False, encoding='utf-8-sig')
    print("Successfully calculated data and saved under path: {}".format(path_delta))
def main():
    """Clean the raw KIT-Open Excel reports for both ISSD and IM.

    Runs clean_report() over each institute's raw report and writes the
    cleaned versions next to them in the last-quarter data folder.
    """
    print('Cleaning Excel file from KIT Open...')
    # clean report of ISSD
    path_report_issd = func.get_abspath_report()
    path_report_clean_issd = func.get_abspath_folder_lastquarter() + 'report_{}_clean.xlsx'.format(
        func.get_last_2_quarters(connector=''))
    clean_report(path_report_issd, path_report_clean_issd)
    print('Successfully created cleaned report of ISSD under path: {}'.format(path_report_clean_issd))
    # clean report of IM
    path_report_iism = func.get_abspath_folder_lastquarter() + 'report_im_{}.xlsx'.format(func.get_last_2_quarters())
    path_report_clean_iism = func.get_abspath_folder_lastquarter() + 'report_im_{}_clean.xlsx'.format(
        func.get_last_2_quarters(connector=''))
    clean_report(path_report_iism, path_report_clean_iism)
    # BUG FIX: this message previously printed the ISSD path instead of the IM path.
    print('Successfully created cleaned report of IM under path: {}'.format(path_report_clean_iism))
def main():
    """Run the complete quarterly reporting pipeline end to end."""
    # Ensure the data folder for the last quarter exists before anything runs.
    folder_lastqtr = func.get_abspath_folder_lastquarter()
    print(folder_lastqtr)
    if not os.path.exists(folder_lastqtr):
        os.makedirs(folder_lastqtr)

    # Stage 1: build search strings and fetch + clean the KIT-Open data.
    search_string.main()
    search_string_im.main()
    get_KITopen.main()
    get_KITopen_im.main()
    clean_report.main()

    # Stage 2: predict missing KIT tags.
    predict_tag.main()

    # Stage 3: citation deltas. PoPCites.csv must be exported manually from
    # Publish-or-Perish, so block here until the user drops it into the folder.
    while not os.path.exists(func.get_abspath_folder_lastquarter() + "PoPCites.csv"):
        pause = input(
            "Please add PoPCites.csv of {} under the path: {} and then enter any values to continue"
            .format(func.get_last_2_quarters(), func.get_abspath_folder_lastquarter()))
    calc_cites_delta.main()

    # Stage 4: author relationships and cooperation network.
    get_data_network.main()
    get_network.main()

    # Stage 5: wordclouds of paper titles.
    get_wordcloud.main()
def main():
    """Generate wordcloud images from paper titles of the ISSD and IM reports.

    Writes title lists as .txt files into a 'report title' subfolder, then
    renders one overall ISSD cloud, per-year ISSD clouds for the last four
    years plus an aggregated "older" cloud, and one IM cloud.
    """
    print("Generating wordclouds of paper titles...")
    # check/create output folder for the title text files
    path_report_title = func.get_abspath_folder_lastquarter() + 'report title'
    if not os.path.exists(path_report_title):
        os.makedirs(path_report_title)

    # ---- overall ISSD wordcloud ------------------------------------------
    path_report_issd = func.get_abspath_folder_lastquarter() + '\\report_{}_clean.xlsx'.format(func.get_last_2_quarters())
    df_report_issd = pd.read_excel(path_report_issd)
    df_title_issd = df_report_issd['Titel']
    df_title_issd.to_csv(path_report_title + '\\report_title_issd_{}.txt'.format(func.get_last_2_quarters()), index=False)
    path_title_issd = path_report_title + '\\report_title_issd_{}.txt'.format(func.get_last_2_quarters())
    path_wordcloud_issd = func.get_abspath_folder_lastquarter() + 'wordcloud_issd.png'
    get_wordcloud(path_title_issd, path_wordcloud_issd)

    # ---- per-year ISSD wordclouds ----------------------------------------
    lastyr = datetime.datetime.now().year - 1
    # BUG FIX: the original loop used range(4), so the `i == 4` branch that
    # aggregates everything older than the last four years was unreachable;
    # range(5) makes that aggregated cloud actually get generated.
    for i in range(5):
        if i <= 3:
            # one cloud per single year
            df_title_issd_tmp = df_report_issd.loc[(df_report_issd['Erscheinungsjahr'] == lastyr - i), ['Titel']]
        else:
            # aggregated cloud for all older publications
            df_title_issd_tmp = df_report_issd.loc[(df_report_issd['Erscheinungsjahr'] <= (lastyr - i)), ['Titel']]
        df_title_issd_tmp.to_csv(path_report_title + '\\report_title_issd_{}.txt'.format(lastyr - i), index=False)
        path_title_issd_tmp = path_report_title + '\\report_title_issd_{}.txt'.format(lastyr - i)
        path_wordcloud_issd_tmp = func.get_abspath_folder_lastquarter() + 'wordcloud_issd_{}.png'.format(lastyr - i)
        get_wordcloud(path_title_issd_tmp, path_wordcloud_issd_tmp, width_value=1000, height_value=600)

    # ---- IM wordcloud -----------------------------------------------------
    path_report_im = func.get_abspath_folder_lastquarter() + 'report_im_{}.xlsx'.format(func.get_last_2_quarters())
    df_report_im = pd.read_excel(path_report_im, sheet_name='Publikationen')
    df_title_im = df_report_im['Titel']
    df_title_im.to_csv(path_report_title + '\\report_title_im_{}.txt'.format(func.get_last_2_quarters()), index=False)
    path_title_iism = path_report_title + '\\report_title_im_{}.txt'.format(func.get_last_2_quarters())
    path_wordcloud_im = func.get_abspath_folder_lastquarter() + 'wordcloud_im.png'
    get_wordcloud(path_title_iism, path_wordcloud_im)
    # BUG FIX: corrected typo "wordclous" in the success message
    print("Successfully generated wordclouds of paper titles")
def main():
    """Download the IM publication report from KIT-Open as an Excel file.

    Reads the previously generated author search string, embeds it into the
    KIT-Open report URL, and saves the returned Excel content into the
    last-quarter data folder.
    """
    print("Getting data of IM from KIT-Open...")

    # Load the author search string prepared by search_string_im.
    search_str_file = os.path.abspath(
        '') + '\\researchers list\search string_im {}.txt'.format(
            func.get_last_2_quarters(connector='-'))
    with open(search_str_file, 'r', encoding='utf-8') as fh:
        raw_query = fh.read()

    # URL-encode author names and assemble the report request.
    authors_param = func.replace_name_url(raw_query)
    report_url = 'https://publikationen.bibliothek.kit.edu/auswertungen/report.php?external_publications=all&open_access_availability=do_not_care&full_text=do_not_care&key_figures=number_of_publications&year=2010-&consider_online_advance_publication_date=true&consider_additional_pof_structures=false&row=type&column=year&authors='\
        + authors_param\
        + '&in_opac=false&format=excel&publications=true'
    response = requests.get(report_url, allow_redirects=True)

    # Persist the Excel payload into the last-quarter folder.
    report_file = func.get_abspath_folder_lastquarter(
    ) + 'report_im_{}.xlsx'.format(func.get_last_2_quarters())
    with open(report_file, 'wb') as fh:
        fh.write(response.content)
    print(
        "Successfully saved data of IM into Excel file under path: {}".format(
            report_file))
def main():
    """Build the author co-occurrence matrix and save it as an Excel file.

    For every publication row in the cleaned ISSD report, marks with 1 each
    researcher (from the researchers list) whose "Last, First" name appears
    in the row's author string; the resulting binary matrix feeds the
    cooperation-network step.
    """
    print("Getting data of authors...")
    # get namelist of authors
    path_namelist = func.get_abspath_researcher()
    namelist = pd.read_excel(path_namelist)
    firstname = namelist['First name']
    lastname = namelist['Last name']
    fullname_list = lastname + ', ' + firstname  # format example: "Feine, Jasper"

    # read report and get author list
    path_report = func.get_abspath_folder_lastquarter() + \
        'report_{}_clean.xlsx'.format(func.get_last_2_quarters(connector=''))
    df_report = pd.read_excel(path_report)
    df_author = df_report['Autor']
    df_author_eng = func.replace_ger_char(df_author)  # normalize umlauts etc.

    # binary membership matrix: rows = publications, columns = researchers.
    # BUG FIX: np.int was removed in NumPy 1.24 — use the builtin int dtype.
    x = np.zeros(shape=(len(df_author_eng), len(fullname_list)), dtype=int, order='C')
    for i in range(len(df_author_eng)):
        for h in range(len(fullname_list)):
            if df_author_eng[i].find(fullname_list[h]) != -1:
                x[i, h] = 1

    # merge result and generate .xlsx
    df_result = pd.DataFrame(data=x, columns=fullname_list)
    path_data_network = os.path.abspath(
        '') + '\data\{}\data_network_{}.xlsx'.format(
            func.get_last_2_quarters(connector='-'),
            func.get_last_2_quarters())
    df_result.to_excel(path_data_network, index=False)
    print("Successfully saved author data under path: {}".format(
        path_data_network))
def main():
    """Scrape the IM team page and write a KIT-Open author search string.

    Collects names from selected staff categories, normalizes German
    characters, re-adds original German spellings that were changed by the
    normalization, and joins all names with ' OR ' into a query saved as a
    .txt file.
    """
    print("Getting search string of IM ...")
    # get team list
    url_team = "https://im.iism.kit.edu/team.php"
    html_team = func.get_html(url_team)
    soup_team = BeautifulSoup(html_team, "html.parser")
    last_name = []
    first_name = []
    for table in soup_team.find_all('table', class_="collapseTable"):
        caption = table.find('caption', align='top')
        # select only prof., postdocs, doctoral researchers and junior researchers
        if (str(caption.text).find("Leitung") != -1) \
                or (str(caption.text).find("Forschungsgruppenleitung") != -1) \
                or (str(caption.text).find("Wissenschaftliche Mitarbeiter*Innen am KIT und FZI") != -1) \
                or (str(caption.text).find("Junior Researchers") != -1):
            for tr in table.find_all('tr'):
                for a in tr.find_all('a', itemprop='name'):
                    # names on the page are "Last, First"
                    name = str(a.text)
                    temp = name.split(", ")
                    last_name.append(temp[0])
                    first_name.append(temp[1])
    namelist = {'Last name': last_name, 'First name': first_name}
    df_team_de = pd.DataFrame(namelist)
    df_team = df_team_de.copy()
    df_team = func.replace_ger_char(df_team)
    df_search = df_team.copy()

    # Re-append original German spellings that normalization altered, so the
    # search string matches both variants.
    last_name_flag_de = df_team_de['Last name'].isin(df_team['Last name'])
    first_name_flag_de = df_team_de['First name'].isin(df_team['First name'])
    for i in range(len(df_team_de)):
        if not last_name_flag_de[i] or not first_name_flag_de[i]:
            # BUG FIX: DataFrame.append was removed in pandas 2.0 -> pd.concat
            df_search = pd.concat([df_search, df_team_de.loc[[i]]], ignore_index=True)

    # generate search string and save it to txt-file
    search_str = ''
    for i in range(len(df_search)):
        # quote names containing spaces so KIT-Open treats them as a phrase
        if (str(df_search['Last name'][i]).find(' ') != -1) or (str(df_search['First name'][i]).find(' ') != -1):
            search_str += '"' + df_search['Last name'][i] + ', ' + df_search['First name'][i] + '"'
        else:
            search_str += df_search['Last name'][i] + ', ' + df_search['First name'][i]
        if i != (len(df_search) - 1):
            search_str += ' OR '
    path_search_str = os.path.abspath('') + '\\researchers list\search string_im {}.txt'.format(func.get_last_2_quarters(connector='-'))
    with open(path_search_str, "w", encoding='utf-8') as f:
        f.write(search_str)
    print("Successfully created file of search string under path: {}".format(path_search_str))
def main():
    """Predict KIT-tags for report rows whose 'KIT-Tagging' column is blank.

    Builds a binary feature vector per blank row from keyword hits in the
    'Quelle' column, runs the trained Keras model, and saves the top-3
    predicted tags per row to an Excel file.
    """
    print("Predicting blank KIT-tags in the report...")
    path_report_clean = func.get_abspath_folder_lastquarter(
    ) + 'report_{}_clean.xlsx'.format(func.get_last_2_quarters(connector=''))
    df_report = pd.read_excel(path_report_clean)
    # rows still missing a KIT tag
    df_blank = df_report.loc[(df_report['KIT-Tagging'].isnull()),
                             ['Publikationstyp', 'Autor', 'Quelle']]
    df_blank = df_blank.reset_index(drop=True)
    if len(df_blank) > 0:
        author_eng = func.replace_ger_char(df_blank['Autor'])
        quelle = df_blank['Quelle']

        # master list of possible tags
        path_tag = 'MasterData/master_data_ranking_2020.xlsx'
        publication = pd.read_excel(path_tag, sheet_name='Publications')
        kit_tag = publication['kit_tag']

        # keyword vocabulary used as model input features
        path_input = 'ML-model training/ML input.xlsx'
        df_input = pd.read_excel(
            path_input, sheet_name='top125')['Input']  # change version of input list here

        # binary keyword-hit matrix over 'Quelle'.
        # BUG FIX: np.int was removed in NumPy 1.24 — use the builtin int dtype.
        x = np.zeros(shape=(len(quelle), len(df_input)), dtype=int, order='C')
        for i in range(len(author_eng)):
            for h in range(len(df_input)):
                if quelle[i].find(df_input[h]) != -1:
                    x[i, h] = 1

        # ML prediction
        path_model = 'ML-model training/tag_model.h5'
        model = load_model(path_model)
        x_t = model.predict(x)

        # Take the three highest-probability tags per row.
        # BUG FIX: DataFrame.append was removed in pandas 2.0 (and per-row
        # append is quadratic) — collect rows first, build the frame once.
        top_k = 3
        top_rows = []
        for i in range(len(x_t)):
            arr = x_t[i]
            top_k_idx = arr.argsort()[-top_k:][::-1]
            top_rows.append([
                kit_tag[top_k_idx[0]],
                kit_tag[top_k_idx[1]],
                kit_tag[top_k_idx[2]],
            ])
        df_top3 = pd.DataFrame(top_rows, columns=['No.1', 'No.2', 'No.3'])

        # save result
        path_predict = func.get_abspath_folder_lastquarter(
        ) + 'tag_prediction_{}.xlsx'.format(func.get_last_2_quarters())
        df_top3 = pd.concat([df_blank['Quelle'], df_top3], axis=1)
        df_top3.to_excel(path_predict, index=False)
        print(df_top3)
        print("Successfully saved predictions under path: {}".format(
            path_predict))
    else:
        print("There aren't any blank KIT-tags in the report")
def main():
    """Scrape the ISSD team page and write a KIT-Open author search string.

    Combines the current team, previously listed members (so researchers who
    left are still found), and the Hall-of-Fame list; normalizes German
    characters, keeps original German spellings as extra search terms, and
    writes both the researcher list (.xlsx) and the ' OR '-joined search
    string (.txt).
    """
    print("Getting search string of ISSD ...")
    # get latest team list
    url_team = "https://issd.iism.kit.edu/team.php"
    html_team = func.get_html(url_team)
    soup_team = BeautifulSoup(html_team, "html.parser")
    last_name = []
    first_name = []
    for table in soup_team.find_all('table', class_="collapseTable"):
        caption = table.find('caption', align='top')
        # select only prof., postdocs, doctoral researchers and junior researchers
        if (str(caption.text).find("Professor and Chairperson") != -1) \
                or (str(caption.text).find("PostDocs") != -1) \
                or (str(caption.text).find("Doctoral Researchers") != -1) \
                or (str(caption.text).find("Junior Researchers") != -1):
            for tr in table.find_all('tr'):
                for a in tr.find_all('a', itemprop='name'):
                    # names on the page are "Last, First"
                    name = str(a.text)
                    temp = name.split(", ")
                    last_name.append(temp[0])
                    first_name.append(temp[1])
    namelist = {'Last name': last_name, 'First name': first_name}
    df_team_de = pd.DataFrame(namelist)
    df_team_now = df_team_de.copy()
    df_team_now = func.replace_ger_char(df_team_now)

    # add team members who have already left (based on previous list)
    path_team_previous = os.path.abspath('') + \
        '\\researchers list\\researchers list {}.xlsx'.format(func.get_last_2_quarters(date=datetime.date.today() - datetime.timedelta(180), connector='-'))
    df_team_previous = pd.read_excel(path_team_previous)
    df_team = pd.concat([df_team_now, df_team_previous], verify_integrity=True, ignore_index=True)

    # get HoF list
    path_hof = os.path.abspath('') + '\\researchers list\Hall of Fame.xlsx'
    df_hof = pd.read_excel(path_hof, sheet_name='HoF')

    # concat team list and HoF list and remove duplicates
    df_total = pd.concat([df_team, df_hof], verify_integrity=True, ignore_index=True)
    df_total = df_total.drop_duplicates()

    # save list and search string
    last2quarter_hyphen = func.get_last_2_quarters(connector='-')
    file_name = 'researchers list {}.xlsx'.format(last2quarter_hyphen)
    path_list = os.path.abspath('') + '\\researchers list\{}'.format(file_name)
    df_total.to_excel(excel_writer=path_list, index=False)

    # add german name spellings (altered by normalization) to the dataframe
    df_search = df_total.copy()
    last_name_flag_de = df_team_de['Last name'].isin(df_total['Last name'])
    first_name_flag_de = df_team_de['First name'].isin(df_total['First name'])
    for i in range(len(df_team_de)):
        if not last_name_flag_de[i] or not first_name_flag_de[i]:
            # BUG FIX: DataFrame.append was removed in pandas 2.0 -> pd.concat
            df_search = pd.concat([df_search, df_team_de.loc[[i]]], ignore_index=True)

    # generate search string and save it to txt-file
    search_str = ''
    for i in range(len(df_search)):
        # quote names containing spaces so KIT-Open treats them as a phrase
        if (str(df_search['Last name'][i]).find(' ') != -1) or (str(df_search['First name'][i]).find(' ') != -1):
            search_str += '"' + df_search['Last name'][i] + ', ' + df_search['First name'][i] + '"'
        else:
            search_str += df_search['Last name'][i] + ', ' + df_search['First name'][i]
        if i != (len(df_search) - 1):
            search_str += ' OR '
    path_search_str = os.path.abspath('') + '\\researchers list\search string {}.txt'.format(last2quarter_hyphen)
    with open(path_search_str, "w", encoding='utf-8') as f:
        f.write(search_str)
    print("Successfully created file of search string under path: {}".format(path_search_str))
def main():
    """Train the KIT-tag classification model from the labeled training set.

    Builds binary keyword-hit features from the 'Quelle' column, one-hot
    encodes the tag labels, splits rows whose tag is unknown into a test set,
    trains a small dense Keras network, and saves it as tag_model.h5.
    """
    print("Training machine learning model...")
    path_namelist = os.path.abspath(os.path.dirname(os.path.dirname(__file__))) + '\\researchers list\\researchers list {}.xlsx'.format(func.get_last_2_quarters(connector='-'))
    namelist = pd.read_excel(path_namelist)
    firstname = namelist['First name']
    lastname = namelist['Last name']
    fullname_list = lastname + ', ' + firstname  # format example: "Feine, Jasper"
    pub_typ_list = ['Proceedingsbeitrag', 'Zeitschriftenaufsatz']

    path_training = 'training set.xlsx'
    df_training = pd.read_excel(path_training, sheet_name='training')
    author = df_training['Autor']
    pub_typ = df_training['Publikationstyp']
    quelle = df_training['Quelle']
    classes = df_training['tag_pub']
    # clean the data, transfer all the german characters to english
    author_eng = replace_ger_char(author)

    # keyword vocabulary used as model input features
    path_input = 'ML input.xlsx'
    df_input = pd.read_excel(path_input, sheet_name='top125')['Input']

    # binary keyword-hit matrix over 'Quelle'.
    # BUG FIX: np.int was removed in NumPy 1.24 — use the builtin int dtype.
    x = np.zeros(shape=(len(quelle), len(df_input)), dtype=int, order='C')
    for i in range(len(author_eng)):
        for h in range(len(df_input)):
            if quelle[i].find(df_input[h]) != -1:
                x[i, h] = 1

    # one-hot encode the tag labels
    tag_file = os.path.abspath(os.path.dirname(os.path.dirname(__file__))) + '\MasterData\master_data_ranking_2020.xlsx'
    publication = pd.read_excel(tag_file, sheet_name='Publications')
    kit_tag = publication['kit_tag']
    y = np.zeros(shape=(len(author_eng), len(kit_tag)), dtype=int, order='C')  # one '1' per row
    z = np.zeros(shape=(len(author_eng)), dtype=int, order='C')  # tag index per row, -1 = unknown
    # BUG FIX: renamed the counter from `sum` — it shadowed the builtin.
    n_unmatched = 0
    for i in range(len(author_eng)):
        for j in range(len(kit_tag)):
            if classes[i] == kit_tag[j]:
                y[i][j] = 1
                z[i] = j
                break
        else:
            # label not found in the tag master list -> goes to the test set
            z[i] = -1
            n_unmatched += 1

    # split: rows with unknown tags become the test set
    x_train = np.zeros(shape=(len(author_eng) - n_unmatched, len(x[0])), dtype=int, order='C')
    x_test = np.zeros(shape=(n_unmatched, len(x[0])), dtype=int, order='C')
    y_train = np.zeros(shape=(len(author_eng) - n_unmatched, len(y[0])), dtype=int, order='C')
    y_test = np.zeros(shape=(n_unmatched, len(y[0])), dtype=int, order='C')
    index_train = 0
    index_test = 0
    for i in range(len(author_eng)):
        if z[i] == -1:
            x_test[index_test] = x[i]
            y_test[index_test] = y[i]
            index_test = index_test + 1
        else:
            x_train[index_train] = x[i]
            y_train[index_train] = y[i]
            index_train = index_train + 1

    # small dense classifier over the keyword features
    input_sh = len(x[0])
    output_len = len(kit_tag)
    model = Sequential()
    model.add(Dense(128, input_shape=(input_sh,), activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(256, activation='relu'))
    model.add(Dense(output_len, activation='softmax'))
    opt = keras.optimizers.Adam(learning_rate=0.01)
    model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
    model.fit(x_train, y_train, epochs=200, batch_size=6)
    # report training-set accuracy (result was previously bound to an unused local)
    model.evaluate(x_train, y_train)

    save_dir = 'tag_model.h5'
    model.save(save_dir)
    print("Successfully saved model under the same folder")
def main():
    """Extract the 200 most frequent non-stopword terms from the 'Quelle'
    column of the latest cleaned report and save them to an Excel file.

    The resulting word list serves as the feature vocabulary for the
    tag-prediction model.
    """
    print("Getting input value from latest report...")
    path_report = os.path.abspath(os.path.dirname(os.path.dirname(__file__))) + '\data\{}\\report_{}_clean.xlsx'.format(func.get_last_2_quarters(connector='-'), func.get_last_2_quarters())
    df_report = pd.read_excel(path_report)
    quelle = df_report['Quelle']
    # merge quelle into one string.
    # PERF FIX: single join instead of quadratic += concatenation in a loop.
    text_quelle = ''.join(quelle)
    # delete numbers and punctuation, keep letters only
    quelle_letters_only = re.sub("[^a-zA-Z]", " ", text_quelle)
    # tokenize and filter text
    stop_words = set(stopwords.words('english'))
    tokens = nltk.word_tokenize(quelle_letters_only)
    filtered_text = [w for w in tokens if w not in stop_words]
    # top 200 most frequent terms
    fdist = FreqDist(filtered_text)
    tops = fdist.most_common(200)
    df_tops = pd.DataFrame(tops)
    print(df_tops)
    df_tops.to_excel('freq_words_top{}.xlsx'.format(len(df_tops)))
    print("Successfully saved most frequent words into 'freq_words_top{}.xlsx' under this folder".format(len(df_tops)))
def main():
    """Generate an interactive HTML cooperation network of authors.

    Reads the binary author co-occurrence matrix produced by
    get_data_network, builds a weighted networkx graph (edge weight = number
    of shared publications), lays it out with a spring layout, and renders
    it with plotly into network_author.html.
    """
    print("Generating cooperation network of authors...")
    path_data_network = func.get_abspath_folder_lastquarter(
    ) + 'data_network_{}.xlsx'.format(func.get_last_2_quarters())
    df_result = pd.read_excel(path_data_network)
    # print(df_result)
    # get namelist
    path_namelist = func.get_abspath_researcher()
    namelist = pd.read_excel(path_namelist)
    firstname = namelist['First name']
    lastname = namelist['Last name']
    fullname_list = lastname + ', ' + firstname
    # drop first row — presumably a placeholder/header entry; verify against list file
    fullname_list = fullname_list.drop([0])
    # print(fullname_list)
    # edges: for each author, count co-occurrences with every other author
    # over the publications in which the author appears
    dict_edges = {}
    for fullname in fullname_list:
        dict_temp = {}
        df_temp = df_result.loc[(df_result[fullname] == 1)]
        for fullname2 in fullname_list:
            if fullname2 != fullname:
                dict_temp[fullname2] = df_temp[fullname2].sum()
        dict_edges[fullname] = dict_temp
    # print(dict_edges)
    # nodes: publication count per author (node size)
    dict_nodes = {}
    for fullname in fullname_list:
        df_temp = df_result.loc[(df_result[fullname] == 1)]
        dict_nodes[fullname] = df_temp[fullname].sum()
    # print(dict_nodes)
    # network: only authors with >0 publications and pairs with >0 shared papers
    nw_author = nx.Graph()
    for node in dict_nodes.keys():  # nodes
        if dict_nodes[node] > 0:
            nw_author.add_node(node, size=dict_nodes[node])
    for edge in dict_edges.keys():
        for co_edge in dict_edges[edge].keys():
            if dict_edges[edge][co_edge] > 0:
                nw_author.add_edge(edge, co_edge, weight=dict_edges[edge][co_edge])
    # get positions
    pos_ = nx.spring_layout(nw_author)
    # make edge trace — two passes so hover text exists for both directions
    # of each undirected edge (A: w seen from B, and B: w seen from A)
    edge_trace = []
    text_trace = []
    for edge in nw_author.edges():
        if nw_author.edges()[edge]['weight'] > 0:
            char_1 = edge[0]
            char_2 = edge[1]
            x0, y0 = pos_[char_1]
            x1, y1 = pos_[char_2]
            text = char_2 + ': ' + str(nw_author.edges()[edge]['weight'])
            # edge width scales with sqrt(weight)
            edge_trace_tmp = make_edge(
                [x0, x1, None], [y0, y1, None],
                text,
                width=nw_author.edges()[edge]['weight']**0.5)
            edge_trace.append(edge_trace_tmp)
    for edge in nw_author.edges():
        if nw_author.edges()[edge]['weight'] > 0:
            char_1 = edge[1]
            char_2 = edge[0]
            x0, y0 = pos_[char_1]
            x1, y1 = pos_[char_2]
            text = char_2 + ': ' + str(nw_author.edges()[edge]['weight'])
            edge_trace_tmp = make_edge(
                [x0, x1, None], [y0, y1, None],
                text,
                width=nw_author.edges()[edge]['weight']**0.5)
            edge_trace.append(edge_trace_tmp)
    # make node trace (markers + author labels)
    node_trace = go.Scatter(x=[], y=[], text=[], textposition="top center",
                            textfont_size=10, mode='markers+text',
                            hoverinfo='none',
                            marker=dict(color=[], size=[], line=None))
    for node in nw_author.nodes():
        x, y = pos_[node]
        node_trace['x'] += tuple([x])
        node_trace['y'] += tuple([y])
        # color_nw is a module-level constant defined outside this view
        node_trace['marker']['color'] += tuple([color_nw])
        # node_trace['marker']['size'] += tuple([5*nw_author.nodes()[node]['size']])
        node_trace['text'] += tuple(['<b>' + node + '</b>'])
    # customize layout: transparent background, no axes/legend
    layout = go.Layout(paper_bgcolor='rgba(0,0,0,0)',
                       plot_bgcolor='rgba(0,0,0,0)', hovermode='x')
    fig = go.Figure(layout=layout)
    for trace in edge_trace:
        fig.add_trace(trace)
    fig.add_trace(node_trace)
    fig.update_layout(showlegend=False)
    fig.update_xaxes(showticklabels=False)
    fig.update_yaxes(showticklabels=False)
    path_html = func.get_abspath_folder_lastquarter() + "network_author.html"
    fig.write_html(path_html)
    print("Successfully generated cooperation network under path: {}".format(
        path_html))