Ejemplo n.º 1
0
def main():
    """Run the whole quarterly reporting pipeline, one stage after another.

    Stages, in order: ensure the output folder of the last quarter exists,
    fetch and clean the reports, predict missing tags, wait until the
    PoPCites.csv export is present and compute the citation delta, build the
    author network, and finally render the word clouds.
    """
    # Make sure the output folder of the last quarter is available.
    folder_lastqtr = func.get_abspath_folder_lastquarter()
    print(folder_lastqtr)
    if not os.path.exists(folder_lastqtr):
        os.makedirs(folder_lastqtr)

    # Stage 1: get and clean data.
    for step in (search_string, search_string_im, get_KITopen,
                 get_KITopen_im, clean_report):
        step.main()

    # Stage 2: predict tag.
    predict_tag.main()

    # Stage 3: PoPCites.csv has to be supplied by hand, so keep prompting
    # until the file shows up in the quarter folder.
    while True:
        path_cites = func.get_abspath_folder_lastquarter() + "PoPCites.csv"
        if os.path.exists(path_cites):
            break
        prompt = "Please add PoPCites.csv of {} under the path: {} and then enter any values to continue".format(
            func.get_last_2_quarters(),
            func.get_abspath_folder_lastquarter())
        input(prompt)
    calc_cites_delta.main()

    # Stage 4: author relationship data and network.
    for step in (get_data_network, get_network):
        step.main()

    # Stage 5: word clouds.
    get_wordcloud.main()
Ejemplo n.º 2
0
def main():
    """Clean the KIT Open Excel reports of ISSD and IM.

    For each of the two reports the input and output paths are built and
    handed to ``clean_report(source_path, target_path)``; the location of the
    cleaned file is printed afterwards.
    """
    print('Cleaning Excel file from KIT Open...')

    folder_lastqtr = func.get_abspath_folder_lastquarter()
    quarters_compact = func.get_last_2_quarters(connector='')

    # clean report of ISSD
    path_report_issd = func.get_abspath_report()
    path_report_clean_issd = folder_lastqtr + 'report_{}_clean.xlsx'.format(
        quarters_compact)
    clean_report(path_report_issd, path_report_clean_issd)
    print('Successfully created cleaned report of ISSD under path: {}'.format(
        path_report_clean_issd))

    # clean report of IISM
    path_report_iism = folder_lastqtr + 'report_im_{}.xlsx'.format(
        func.get_last_2_quarters())
    path_report_clean_iism = folder_lastqtr + 'report_im_{}_clean.xlsx'.format(
        quarters_compact)
    clean_report(path_report_iism, path_report_clean_iism)
    # Report the IM path here; the original printed the ISSD path again.
    print('Successfully created cleaned report of IM under path: {}'.format(
        path_report_clean_iism))
def main():
    """Download the IM publication report from KIT-Open as an Excel file.

    Reads the prepared author search string from the ``researchers list``
    folder below the current working directory, converts it with
    ``func.replace_name_url`` and requests the Excel export from the KIT-Open
    report endpoint. The response body is written to
    ``report_im_<quarters>.xlsx`` in the folder of the last quarter.

    Raises:
        requests.HTTPError: if the server answers with an error status, so
            that an error page is never stored as the report file.
    """
    print("Getting data of IM from KIT-Open...")

    # Backslashes are escaped explicitly; the original relied on the invalid
    # escape sequence "\s", which Python only tolerates with a warning.
    path_search_str = os.path.abspath(
        '') + '\\researchers list\\search string_im {}.txt'.format(
            func.get_last_2_quarters(connector='-'))
    with open(path_search_str, 'r', encoding='utf-8') as f:
        search_str = f.read()
    url_author_str = func.replace_name_url(search_str)
    url = 'https://publikationen.bibliothek.kit.edu/auswertungen/report.php?external_publications=all&open_access_availability=do_not_care&full_text=do_not_care&key_figures=number_of_publications&year=2010-&consider_online_advance_publication_date=true&consider_additional_pof_structures=false&row=type&column=year&authors='\
            + url_author_str\
            + '&in_opac=false&format=excel&publications=true'
    r = requests.get(url, allow_redirects=True)
    # Fail loudly on 4xx/5xx instead of saving the error body as .xlsx.
    r.raise_for_status()

    path_report = func.get_abspath_folder_lastquarter(
    ) + 'report_im_{}.xlsx'.format(func.get_last_2_quarters())
    with open(path_report, 'wb') as f:
        f.write(r.content)

    print(
        "Successfully saved data of IM into Excel file under path: {}".format(
            path_report))
Ejemplo n.º 4
0
def main():
    """Build the publication x researcher matrix and save it as Excel.

    The researcher list provides one "Last name, First name" string per
    researcher. Every entry of the ``Autor`` column of the cleaned report
    (after ``func.replace_ger_char``) is searched for each of these strings;
    cell ``[i, h]`` of the result is 1 when researcher ``h`` occurs in the
    author string of publication ``i`` and 0 otherwise. The matrix is written
    to ``data_network_<quarters>.xlsx`` with the researcher names as column
    headers.
    """
    print("Getting data of authors...")

    # get namelist of authors
    path_namelist = func.get_abspath_researcher()
    namelist = pd.read_excel(path_namelist)
    firstname = namelist['First name']
    lastname = namelist['Last name']
    fullname_list = lastname + ', ' + firstname  # "<Last name>, <First name>"

    # read report and get author list
    path_report = func.get_abspath_folder_lastquarter()+\
                  'report_{}_clean.xlsx'.format(func.get_last_2_quarters(connector=''))
    df_report = pd.read_excel(path_report)
    df_author = df_report['Autor']
    df_author_eng = func.replace_ger_char(df_author)

    # get matrix
    # The builtin int replaces np.int, which was removed in NumPy 1.24 and
    # was only ever an alias of the builtin.
    x = np.zeros(shape=(len(df_author_eng), len(fullname_list)),
                 dtype=int,
                 order='C')
    for i, authors in enumerate(df_author_eng):
        for h, fullname in enumerate(fullname_list):
            if authors.find(fullname) != -1:
                x[i, h] = 1

    # merge result and generate .xlsx
    df_result = pd.DataFrame(data=x, columns=fullname_list)
    # Backslashes are escaped explicitly; the original relied on the invalid
    # escape sequences "\d" and "\{".
    path_data_network = os.path.abspath(
        '') + '\\data\\{}\\data_network_{}.xlsx'.format(
            func.get_last_2_quarters(connector='-'),
            func.get_last_2_quarters())
    df_result.to_excel(path_data_network, index=False)

    print("Successfully saved author data under path: {}".format(
        path_data_network))
def main():
    """Generate word clouds from the paper titles of the ISSD and IM reports.

    Title columns are exported as text files into the ``report title``
    sub-folder of the last quarter's folder and each file is passed to
    ``get_wordcloud(title_path, image_path, ...)``:

    * one cloud over all titles of the cleaned ISSD report,
    * one cloud per publication year for the last four full years of the
      ISSD report (1000 x 600),
    * one cloud over all titles of the ``Publikationen`` sheet of the IM
      report.
    """
    print("Generating wordclouds of paper titles...")

    folder_lastqtr = func.get_abspath_folder_lastquarter()
    quarters = func.get_last_2_quarters()

    # check folder exists
    path_report_title = folder_lastqtr + 'report title'
    os.makedirs(path_report_title, exist_ok=True)

    # get report and generate wordcloud of issd report
    path_report_issd = folder_lastqtr + '\\report_{}_clean.xlsx'.format(quarters)
    df_report_issd = pd.read_excel(path_report_issd)
    path_title_issd = path_report_title + '\\report_title_issd_{}.txt'.format(quarters)
    df_report_issd['Titel'].to_csv(path_title_issd, index=False)
    path_wordcloud_issd = folder_lastqtr + 'wordcloud_issd.png'

    get_wordcloud(path_title_issd, path_wordcloud_issd)

    # one wordcloud per publication year, for the last four full years
    # NOTE(review): the original carried an unreachable `elif i == 4` branch
    # that would have pooled all older years; it never ran with range(4), so
    # it is dropped here and the produced files are unchanged. Extend the
    # range if that fifth cloud is actually wanted.
    lastyr = datetime.datetime.now().year - 1
    for offset in range(4):
        year = lastyr - offset
        df_title_year = df_report_issd.loc[
            df_report_issd['Erscheinungsjahr'] == year, ['Titel']]
        path_title_year = path_report_title + '\\report_title_issd_{}.txt'.format(year)
        df_title_year.to_csv(path_title_year, index=False)
        path_wordcloud_year = folder_lastqtr + 'wordcloud_issd_{}.png'.format(year)

        get_wordcloud(path_title_year, path_wordcloud_year,
                      width_value=1000, height_value=600)

    # get report and generate .txt of iism report
    path_report_im = folder_lastqtr + 'report_im_{}.xlsx'.format(quarters)
    df_report_im = pd.read_excel(path_report_im, sheet_name='Publikationen')
    path_title_iism = path_report_title + '\\report_title_im_{}.txt'.format(quarters)
    df_report_im['Titel'].to_csv(path_title_iism, index=False)
    path_wordcloud_im = folder_lastqtr + 'wordcloud_im.png'

    get_wordcloud(path_title_iism, path_wordcloud_im)

    print("Successfully generated wordclous of paper titles")
Ejemplo n.º 6
0
def main():
    """Predict the three most likely KIT tags for untagged publications.

    Rows of the cleaned report whose ``KIT-Tagging`` cell is empty are
    selected. For each of them a 0/1 feature vector is built that marks which
    entries of the ML input list occur in the ``Quelle`` string. The stored
    model scores these vectors, and the three highest-scoring positions are
    mapped to tags through the ``kit_tag`` column of the master data. The
    result (``Quelle`` plus ``No.1``..``No.3``) is written to
    ``tag_prediction_<quarters>.xlsx`` and printed.
    """
    print("Predicting blank KIT-tags in the report...")

    path_report_clean = func.get_abspath_folder_lastquarter(
    ) + 'report_{}_clean.xlsx'.format(func.get_last_2_quarters(connector=''))
    df_report = pd.read_excel(path_report_clean)
    df_blank = df_report.loc[(df_report['KIT-Tagging'].isnull()),
                             ['Publikationstyp', 'Autor', 'Quelle']]
    df_blank = df_blank.reset_index(drop=True)

    if len(df_blank) == 0:
        print("There aren't any blank KIT-tags in the report")
        return

    quelle = df_blank['Quelle']

    path_tag = 'MasterData/master_data_ranking_2020.xlsx'
    publication = pd.read_excel(path_tag, sheet_name='Publications')
    kit_tag = publication['kit_tag']

    path_input = 'ML-model training/ML input.xlsx'
    df_input = pd.read_excel(
        path_input,
        sheet_name='top125')['Input']  # change version of input list

    # Feature matrix: x[i, h] == 1 when input term h occurs in source i.
    # The builtin int replaces np.int (removed in NumPy 1.24), and the loop
    # runs over `quelle` itself rather than over the length of an unrelated
    # author series.
    # NOTE(review): a missing (NaN) 'Quelle' cell has no .find() and will
    # raise here, as it did before - confirm the column is always filled.
    x = np.zeros(shape=(len(quelle), len(df_input)), dtype=int, order='C')
    for i, source in enumerate(quelle):
        for h, term in enumerate(df_input):
            if source.find(term) != -1:
                x[i, h] = 1

    # ML prediction
    path_model = 'ML-model training/tag_model.h5'
    model = load_model(path_model)
    x_t = model.predict(x)

    # Collect the rows first and build the frame once; DataFrame.append was
    # removed in pandas 2.0.
    top_k = 3
    top_rows = []
    for scores in x_t:
        top_k_idx = scores.argsort()[-top_k:][::-1]  # best score first
        top_rows.append([kit_tag[idx] for idx in top_k_idx])
    df_top3 = pd.DataFrame(data=top_rows, columns=['No.1', 'No.2', 'No.3'])

    # save result
    path_predict = func.get_abspath_folder_lastquarter(
    ) + 'tag_prediction_{}.xlsx'.format(func.get_last_2_quarters())
    df_top3 = pd.concat([df_blank['Quelle'], df_top3], axis=1)
    df_top3.to_excel(path_predict, index=False)

    print(df_top3)
    print("Successfully saved predictions under path: {}".format(
        path_predict))
Ejemplo n.º 7
0
def main():
    """Render the co-authorship network of the researchers as an HTML page.

    Reads the 0/1 publication x researcher matrix from
    ``data_network_<quarters>.xlsx``. For every researcher the rows marked 1
    are selected; the sum of the researcher's own column becomes the node
    size, and the sums of the other researchers' columns over the same rows
    become the edge weights. Researchers with a positive count become nodes
    and pairs with a positive weight become edges of an undirected graph. The
    graph is laid out with ``nx.spring_layout`` and drawn with plotly; each
    edge is drawn once per direction so that the hover text can name either
    endpoint. The figure is written to ``network_author.html``.
    """
    print("Generating cooperation network of authors...")

    path_data_network = func.get_abspath_folder_lastquarter(
    ) + 'data_network_{}.xlsx'.format(func.get_last_2_quarters())
    df_result = pd.read_excel(path_data_network)

    # get namelist
    path_namelist = func.get_abspath_researcher()
    namelist = pd.read_excel(path_namelist)
    fullname_list = namelist['Last name'] + ', ' + namelist['First name']
    # NOTE(review): the first researcher row is dropped on purpose by the
    # original code - reason not visible here, confirm with the data owner.
    fullname_list = fullname_list.drop([0])

    # nodes and edges, both derived from the same row selection
    dict_nodes = {}
    dict_edges = {}
    for fullname in fullname_list:
        df_temp = df_result.loc[(df_result[fullname] == 1)]
        dict_nodes[fullname] = df_temp[fullname].sum()
        dict_edges[fullname] = {
            fullname2: df_temp[fullname2].sum()
            for fullname2 in fullname_list if fullname2 != fullname
        }

    # network
    nw_author = nx.Graph()
    for node, size in dict_nodes.items():
        if size > 0:
            nw_author.add_node(node, size=size)

    for author, co_authors in dict_edges.items():
        for co_author, weight in co_authors.items():
            if weight > 0:
                nw_author.add_edge(author, co_author, weight=weight)

    # get positions
    pos_ = nx.spring_layout(nw_author)

    # make edge trace: first all edges in stored direction, then all edges
    # reversed, so the hover text is available for both endpoints
    edge_trace = []
    for reverse in (False, True):
        for edge in nw_author.edges():
            weight = nw_author.edges()[edge]['weight']
            if not weight > 0:
                # The original only guarded the endpoint assignment and would
                # have drawn such an edge with stale endpoints; skip it.
                continue
            char_1, char_2 = (edge[1], edge[0]) if reverse else (edge[0], edge[1])
            x0, y0 = pos_[char_1]
            x1, y1 = pos_[char_2]
            text = char_2 + ': ' + str(weight)
            edge_trace.append(
                make_edge([x0, x1, None], [y0, y1, None],
                          text,
                          width=weight**0.5))

    # make node trace
    node_x = []
    node_y = []
    node_color = []
    node_text = []
    for node in nw_author.nodes():
        x, y = pos_[node]
        node_x.append(x)
        node_y.append(y)
        node_color.append(color_nw)
        node_text.append('<b>' + node + '</b>')
    node_trace = go.Scatter(x=node_x,
                            y=node_y,
                            text=node_text,
                            textposition="top center",
                            textfont_size=10,
                            mode='markers+text',
                            hoverinfo='none',
                            marker=dict(color=node_color, size=[], line=None))

    # customize layout
    layout = go.Layout(paper_bgcolor='rgba(0,0,0,0)',
                       plot_bgcolor='rgba(0,0,0,0)',
                       hovermode='x')
    fig = go.Figure(layout=layout)
    for trace in edge_trace:
        fig.add_trace(trace)
    fig.add_trace(node_trace)
    fig.update_layout(showlegend=False)
    fig.update_xaxes(showticklabels=False)
    fig.update_yaxes(showticklabels=False)

    path_html = func.get_abspath_folder_lastquarter() + "network_author.html"
    fig.write_html(path_html)

    print("Successfully generated cooperation network under path: {}".format(
        path_html))