Example #1
def gen_model(remodel=True, rebuild=True, model='', num_topics=5, user_config=None):
    # * Load model
    if path.isfile(f'{lda_src}{model}') and not (remodel or rebuild):
        pa_print.nprint('\nLoading bodies, dict, corpus, and model...')
        processed_bodies = pickle.load(open(f'{lda_src}bodies.pkl', 'rb'))
        dictionary = gensim.corpora.Dictionary.load(f'{lda_src}dictionary.gensim')
        corpus = pickle.load(open(f'{lda_src}corpus.pkl', 'rb'))
        lda_model = lda.load(f'{lda_src}{model}')

    else: # Build model afterwards
        # Load resources
        if path.isfile(f'{lda_src}dictionary.gensim') and path.isfile(f'{lda_src}corpus.pkl') and not rebuild:
            pa_print.nprint('\nLoading bodies, dict and corpus...')
            processed_bodies = pickle.load(open(f'{lda_src}bodies.pkl', 'rb'))
            dictionary = gensim.corpora.Dictionary.load(f'{lda_src}dictionary.gensim')
            corpus = pickle.load(open(f'{lda_src}corpus.pkl', 'rb'))
        else:
            # Remove old
            for doc in [f'{lda_src}bodies.pkl', f'{lda_src}dictionary.gensim', f'{lda_src}corpus.pkl']:
                try: os.remove(doc)
                except FileNotFoundError: pass

            # Build everything from text files
            pa_print.nprint('Building dict and corpus...')
            doc_list = []
            processed_bodies = []

            for text_fn in os.listdir(grobid_text_src):
                if text_fn.startswith('grob_'):
                    with open(grobid_text_src+text_fn, 'r') as doc:
                        doc_list.append(doc.read())

            for doc in doc_list:
                processed_words = clean_text(doc, user_config) # extract only the meaningful words (honors the user config)
                processed_bodies.append(processed_words)

            # Save processed bodies for coherence score
            pickle.dump(processed_bodies, open(f'{lda_src}bodies.pkl', 'wb'))

            # Make and save dict and corpus
            dictionary = corpora.Dictionary(processed_bodies)
            dictionary.filter_extremes(no_below=3) # drop tokens that appear in fewer than 3 documents
            dictionary.save(f'{lda_src}dictionary.gensim')

            corpus = [dictionary.doc2bow(doc) for doc in processed_bodies]
            pickle.dump(corpus, open(f'{lda_src}corpus.pkl', 'wb'))

        # Build LDA model - default settings
        if remodel or rebuild or not path.isfile(f'{lda_src}{model}'):
            pa_print.nprint('Building model...')
            alpha = 'asymmetric'
            eta = 0.5
            lda_model = lda(corpus, num_topics=num_topics, id2word=dictionary,
                            random_state=100, passes=10, alpha=alpha, eta=eta, per_word_topics=True)
            date = datetime.now().strftime('%Y%m%d')
            lda_model.save(f'{lda_src}{date}-{num_topics}-{alpha}-{eta}.model')
            pa_print.nprint('Saved model!')
        else: lda_model = lda.load(f'{lda_src}{model}')

    return processed_bodies, dictionary, corpus, lda_model
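
A minimal call sketch (assuming the module-level lda_src, grobid_text_src, pa_print, and the lda alias for gensim's LdaModel are configured as in the surrounding module, and that user_config is whatever clean_text expects):

processed_bodies, dictionary, corpus, lda_model = gen_model(
    remodel=True, rebuild=False, num_topics=8, user_config=user_config)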
Example #2
def gen_wordcloud(processed_data):
    import collections
    import matplotlib.pyplot as plt
    from wordcloud import WordCloud

    for data in processed_data:
        words = [word for doc in data[1] for word in doc]
        counter = dict(collections.Counter(words))
        wc = WordCloud(width=1920, height=1444,
                       background_color="white", max_words=500
                       ).generate_from_frequencies(counter)
        plt.imshow(wc, interpolation='bilinear')
        plt.axis("off")
        plt.savefig(f'./output/wordcloud_{data[0]}.png', dpi=300)
    pa_print.nprint('Generated .png files in ./output!')
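
The function expects processed_data as (label, documents) pairs, where each document is a token list and the label names the output file; a hedged sketch with toy data (assumes the repo context: ./output exists and pa_print is defined):

processed_data = [
    ('bodies', [['gesture', 'sensor', 'mapping'], ['synthesis', 'gesture']]),
    ('titles', [['augmented', 'violin'], ['haptic', 'feedback']]),
]
gen_wordcloud(processed_data)  # writes wordcloud_bodies.png and wordcloud_titles.png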
Example #3
def gen_topic_plots(corpus, lda_model, year_dict, year_list, year_start, year_end):
    year_counts = np.zeros(year_end-year_start)

    # Add topic distribution from each doc into buckets of years
    for i in range(len(corpus)):
        topics = lda_model.get_document_topics(corpus[i])
        for j in range(year_start, year_end):
            if year_list[i][0] == j:
                year_counts[j-year_start] += 1 # how many bodies in each year
                for k, year_top in enumerate(year_dict[j]):
                    for top in topics:
                        if str(year_top[0]) == str(top[0]):
                            year_top = list(year_top)
                            year_top[1] = float(year_top[1]) + float(top[1])
                            year_dict[j][k] = tuple(year_top)

    # Weight the topic values by numbers of papers published each year
    for key, val in year_dict.items():
        for index, j in enumerate(val):
            j = list(j)
            j[1] = float(j[1]) / year_counts[index]
            year_dict[key][index] = tuple(j)

    # Create empty x/y series across the year range, one series per topic
    xvals = [ [] for _ in range(num_topics) ]
    yvals = [ [] for _ in range(num_topics) ]
    plt.figure(figsize=(20,10))

    for year, topics in year_dict.items():
        for topic in topics:
            xvals[topic[0]].append(int(year))
            yvals[topic[0]].append(topic[1])

    for i in range(num_topics):
        plt.scatter(xvals[i], yvals[i], label=f'Topic {i}')
        s = UnivariateSpline(xvals[i], yvals[i], s=.1)
        xs = np.linspace(year_start, year_end, 50)
        ys = s(xs)
        plt.plot(xs, ys, label=f'Spline for topic {i}')

    plt.legend()
    plt.ylim(bottom=0)
    plt.xticks(range(year_start, year_end))
    plt.xlabel('Year')
    plt.ylabel('Occurrence of Topic over Yearly Papers')
    plt.title('Occurrence of Topics over Publication Year')
    plt.savefig('./output/topic_occurrence.png')

    pa_print.nprint('Generated diagram .png in ./output!')
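
year_dict is expected to map each year to a list of (topic_id, weight) tuples, and year_list to hold (year, filename) tuples aligned index-wise with corpus; a hypothetical setup (the zero-weight seeding is an assumption):

year_start, year_end = 2001, 2021
year_dict = {y: [(t, 0.0) for t in range(num_topics)] for y in range(year_start, year_end)}
gen_topic_plots(corpus, lda_model, year_dict, year_list, year_start, year_end)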
Example #4
def gen_lda(lda_model, corpus, processed_bodies, dictionary):
    # Compute Perplexity
    pa_print.nprint(f'Perplexity: {lda_model.log_perplexity(corpus)}')  # per-word likelihood bound; the derived perplexity (2^-bound) is lower-is-better

    # Compute Coherence Score
    coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_bodies, dictionary=dictionary, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    pa_print.nprint(f'Coherence Score: {coherence_lda}')

    # Visualize the gathered topics with pyLDAvis
    lda_display = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)
    pyLDAvis.save_html(lda_display, './output/lda.html')
    pa_print.nprint('Generated lda.html in ./output!')
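
The imports gen_lda relies on (gensim's CoherenceModel, and pyLDAvis 3.x where the gensim bridge lives in pyLDAvis.gensim_models):

from gensim.models import CoherenceModel
import pyLDAvis
import pyLDAvis.gensim_models

gen_lda(lda_model, corpus, processed_bodies, dictionary)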
Example #5
def gen_counts(processed_data, year_list):
    top_counts_dfs = {}
    alt_top_counts_dfs = {}
    unique_dfs = {}
    abs_unique_dfs = {}

    for data in processed_data:
        # * Most popular keywords for each year (100)
        yearly_bodies, top_counts = {}, {}

        for year, doc in zip(year_list, data[1]):
            year = year[0]
            yearly_bodies.setdefault(year, []).extend(doc) # accumulate all words from each year's papers

        for year in yearly_bodies:
            counts = collections.Counter(yearly_bodies[year])
            top_counts[year] = counts.most_common(100) # take most common

        top_counts = collections.OrderedDict(sorted(top_counts.items()))

        # Two columns [year, ('term', count)] - for Google Sheets
        top_counts_df = pd.DataFrame([[i,j] for i in top_counts.keys() for j in dict(top_counts[i]).items()])
        top_counts_dfs[data[0]] = top_counts_df

        # Columns by years (20 columns)
        alt_top_counts_df = pd.DataFrame.from_dict(top_counts, orient='index')
        alt_top_counts_dfs[data[0]] = alt_top_counts_df

        # * Get unique counts by removing prior years' top 5 words (looking backwards)
        unique_counts = {}
        old_top = []

        for year in top_counts:
            cur_counts = dict(top_counts[year]) # keep a dict for counts

            # remove words that already topped a prior year
            for key in old_top:
                cur_counts.pop(key, None)

            unique_words = list(cur_counts)[:5] # top 5 words remaining this year
            old_top.extend(unique_words) # exclude these from later years

            unique_counts[year] = list(cur_counts.items())

        unique_df = pd.DataFrame.from_dict(unique_counts, orient='index')
        unique_dfs[data[0]] = unique_df

        # * Get absolute unique terms per year (not in the top common words of all other years)
        # Similar process to above but looks both forward and backward
        abs_unique_counts = {}

        for year in top_counts:
            cur_counts = dict(top_counts[year]) # keep a dict for counts
            cur_words = list(cur_counts) # unpack keys into a list (for a set)

            # top counts of every other year (both earlier and later)
            other_counts = {x: top_counts[x] for x in top_counts if x != year}

            other_words = []
            for other_year in other_counts:
                other_words.extend(list(dict(other_counts[other_year])))

            unique_words = set(cur_words) - set(other_words)
            del_words = set(cur_words) - unique_words

            for key in del_words: # words that also appear in other years' top words
                cur_counts.pop(key)
            abs_unique_counts[year] = list(cur_counts.items())

        abs_unique_df = pd.DataFrame.from_dict(abs_unique_counts, orient='index')
        abs_unique_dfs[data[0]] = abs_unique_df

    with pd.ExcelWriter('./output/topics.xlsx') as writer:
        for name in ['bodies', 'titles']:
            top_counts_dfs[name].to_excel(writer, sheet_name=f'Top counts {name}', header=False)
            alt_top_counts_dfs[name].to_excel(writer, sheet_name=f'Alt top counts {name}', header=False)
            unique_dfs[name].to_excel(writer, sheet_name=f'Unique counts {name}', header=False)
            abs_unique_dfs[name].to_excel(writer, sheet_name=f'Absolute unique counts {name}', header=False)

        # lda_model and year_dict are read from module scope here
        topic_row = pd.Series(data=lda_model.show_topics(num_words=10), name='Word constituents of topics')
        topics_df = pd.DataFrame.from_dict(year_dict, orient='index')
        topics_df = pd.concat([topics_df, topic_row.to_frame().T]) # DataFrame.append was removed in pandas 2.0
        topics_df.to_excel(writer, sheet_name='Weighted topics')

    pa_print.nprint('\nGenerated topics.xlsx in ./output!')
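
Since the 'Weighted topics' sheet reads lda_model and year_dict from module scope, both must exist before the call; a hedged call sketch reusing the toy shapes from gen_wordcloud:

# year_list[i] must line up with data[1][i] for every (label, docs) pair
gen_counts(processed_data, year_list)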
Example #6
        os.makedirs(d, exist_ok=True)

    # Ask whether to remodel, rebuild the dictionary and corpus, or load an existing model
    remodel, rebuild = True, True
    model = ''
    answer = int(input('\nWant to [1] remodel, [2] rebuild dictionary and corpus, [3] both, or [4] load model? (1,2,3,4): '))
    if answer == 1:
        rebuild = False
        num_topics = int(input('Number of topics?: '))
    elif answer == 2:
        remodel = False
    elif answer == 3:
        num_topics = int(input('Number of topics?: '))
    elif answer == 4:
        rebuild, remodel = False, False
        pa_print.nprint('\nWhich model?')
        models = [mod for mod in os.listdir(lda_src) if mod.endswith('.model')]
        for i, mod in enumerate(models):
            print(f'{i+1}: {mod}')
        answer = int(input('\nSelect an option: ')) - 1
        model = models[answer]
        num_topics = int(model.split('-')[1])

    # Create list to mark each text with year (will be linked to corpus values)
    year_list = []
    for fn in os.listdir(grobid_text_src):
        if fn.startswith('grob_'):
            name = fn.split('grob_nime')[-1]
            year = name.split('_')[0]
            year_list.append((int(year), name))
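
A worked example of the filename parsing above, assuming Grobid outputs named like 'grob_nime2019_42.txt' (hypothetical name):

fn = 'grob_nime2019_42.txt'
name = fn.split('grob_nime')[-1]  # '2019_42.txt'
year = name.split('_')[0]         # '2019'
print((int(year), name))          # (2019, '2019_42.txt')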
Example #7
def stats_travel(bib_df, conf_df):

    pa_print.nprint('\nComputing travel statistics...')

    outtxt = ''

    trav_df = pd.DataFrame(index=bib_df.index,
                           columns=[
                               'year', 'distance', 'footprint', 'country',
                               'continent', 'gender'
                           ])
    for idx, pub in bib_df.iterrows():
        trav_df.loc[idx, 'year'] = pub['year']
        trav_df.loc[idx, 'distance'] = pub['author distances'][0]
        trav_df.loc[idx, 'footprint'] = pub['author footprints'][0]
        trav_df.loc[idx, 'country'] = pub['countries'][0]
        trav_df.loc[idx, 'continent'] = pub['continents'][0]
        trav_df.loc[idx, 'gender'] = pub['author genders 2'][0]

    trav_df = trav_df.convert_dtypes()

    total_distance = trav_df['distance'].sum()
    total_footprint = trav_df['footprint'].sum()
    average_distance = trav_df['distance'].mean()
    average_footprint = trav_df['footprint'].mean()

    total_distance_per_year = trav_df.groupby(['year'])['distance'].sum()
    total_footprint_per_year = trav_df.groupby(['year'])['footprint'].sum()
    average_distance_per_year = trav_df.groupby(['year'])['distance'].mean()
    average_footprint_per_year = trav_df.groupby(['year'])['footprint'].mean()

    average_distance_per_continent = trav_df.groupby(['continent'
                                                      ])['distance'].mean()
    average_footprint_per_continent = trav_df.groupby(['continent'
                                                       ])['footprint'].mean()

    average_distance_per_country = trav_df.groupby(
        ['country'])['distance'].mean().sort_values(ascending=False)
    average_footprint_per_country = trav_df.groupby(
        ['country'])['footprint'].mean().sort_values(ascending=False)

    average_distance_per_gender = trav_df.groupby(['gender'
                                                   ])['distance'].mean()
    average_footprint_per_gender = trav_df.groupby(['gender'
                                                    ])['footprint'].mean()

    participants_by_country = trav_df.groupby(
        ['country'])['footprint'].count().sort_values(ascending=False)
    participants_by_country_per_year = trav_df.groupby(
        ['year', 'country'])['footprint'].count()

    outtxt += '\nTotal distance %f' % total_distance
    outtxt += '\nTotal footprint %f' % total_footprint
    outtxt += '\nAverage distance per participant %f' % average_distance
    outtxt += '\nAverage footprint per participant %f' % average_footprint

    with pd.ExcelWriter('./output/travel.xlsx') as writer:
        total_distance_per_year.to_excel(writer,
                                         sheet_name='Total dist. per year',
                                         header=False)
        total_footprint_per_year.to_excel(writer,
                                          sheet_name='Total footp. per year',
                                          header=False)
        average_distance_per_year.to_excel(
            writer, sheet_name='Avg. dist. per part. per year', header=False)
        average_footprint_per_year.to_excel(
            writer, sheet_name='Avg. footp. per part. per year', header=False)
        average_distance_per_continent.to_excel(
            writer, sheet_name='Avg. dist. per part. by cont.', header=False)
        average_footprint_per_continent.to_excel(
            writer, sheet_name='Avg. footp. per part. by cont.', header=False)
        average_distance_per_country.to_excel(
            writer, sheet_name='Avg. dist. per part. by count.', header=False)
        average_footprint_per_country.to_excel(
            writer, sheet_name='Avg. footp. per part. by count.', header=False)
        average_distance_per_gender.to_excel(
            writer, sheet_name='Avg. dist. per part. by gender', header=False)
        average_footprint_per_gender.to_excel(
            writer, sheet_name='Avg. footp. per part. by gender', header=False)
        participants_by_country.to_excel(writer,
                                         sheet_name='Participants by count.',
                                         header=False)
        participants_by_country_per_year.to_excel(
            writer, sheet_name='Participants by count. per year', header=False)

    with open('./output/travel.txt', 'w') as text_file:
        text_file.write(outtxt)

    print('\nGenerated travel.txt and travel.xlsx in ./output!')
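
A hedged sketch of the per-publication fields stats_travel reads from each bib_df row; each is a per-author list and [0] selects the first author (the units in the comments are assumptions, not confirmed by this code):

pub = {
    'year': 2019,
    'author distances': [5400.0],   # assumed km traveled by the first author
    'author footprints': [1.2],     # assumed carbon footprint of that travel
    'countries': ['Brazil'],
    'continents': ['South America'],
    'author genders 2': ['F'],
}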
Example #8
def stats_affiliation(bib_df, conf_df):

    pa_print.nprint('\nComputing affiliation statistics...')

    outtxt = ''

    auth_df = pd.DataFrame(index=range(bib_df['author count'].sum()),
                           columns=[
                               'year', 'name', 'citations', 'institutions',
                               'country', 'continent'
                           ])
    mixed_df = pd.DataFrame(
        index=bib_df.index,
        columns=['year', 'institutions', 'country', 'continent'])
    j = 0
    for idx, pub in bib_df.iterrows():
        author_count = pub['author count']
        for i in range(author_count):
            auth_df.loc[j, 'year'] = pub['year']
            auth_df.loc[j, 'name'] = pub['author names'][i][0] + ' ' + pub[
                'author names'][i][1]
            auth_df.loc[j, 'citations'] = pub['citation count']
            auth_df.loc[j, 'institutions'] = pub['institutions'][i]
            auth_df.loc[j, 'country'] = pub['countries'][i]
            auth_df.loc[j, 'continent'] = pub['continents'][i]
            j = j + 1
        mixed_df.loc[idx, 'institutions'] = len(Counter(pub['institutions']).keys()) > 1
        mixed_df.loc[idx, 'country'] = len(Counter(pub['countries']).keys()) > 1
        mixed_df.loc[idx, 'continent'] = len(Counter(pub['continents']).keys()) > 1
        mixed_df.loc[idx, 'year'] = pub['year']

    # subtracting 1 removes the N/A entry from each unique count
    number_of_institutions = auth_df['institutions'].nunique() - 1
    number_of_countries = auth_df['country'].nunique() - 1
    number_of_continents = auth_df['continent'].nunique() - 1

    number_of_institutions_per_year = auth_df.groupby(
        ['year'])['institutions'].nunique() - 1
    number_of_countries_per_year = auth_df.groupby(['year'
                                                    ])['country'].nunique() - 1
    number_of_continents_per_year = auth_df.groupby(
        ['year'])['continent'].nunique() - 1

    top_institutions_by_authors = auth_df.groupby(
        ['institutions']).size().sort_values(ascending=False).head(40)
    countries_by_authors = auth_df.groupby(
        ['country']).size().sort_values(ascending=False)
    continents_by_authors = auth_df.groupby(
        ['continent']).size().sort_values(ascending=False)

    top_institutions_by_authorcitations = auth_df.groupby([
        'institutions'
    ])['citations'].sum().sort_values(ascending=False).head(40)
    countries_by_authorcitations = auth_df.groupby(
        ['country'])['citations'].sum().sort_values(ascending=False)
    continents_by_authorcitations = auth_df.groupby(
        ['continent'])['citations'].sum().sort_values(ascending=False)

    perc_mixed_institute_papers_fraction = 100 * mixed_df[
        mixed_df['institutions'] == True].shape[0] / mixed_df.shape[0]
    perc_mixed_country_papers_fraction = 100 * mixed_df[
        mixed_df['country'] == True].shape[0] / mixed_df.shape[0]
    perc_mixed_continent_papers_fraction = 100 * mixed_df[
        mixed_df['continent'] == True].shape[0] / mixed_df.shape[0]

    temp = mixed_df[mixed_df['institutions'] == True]
    perc_mixed_institute_papers_fraction_per_year = 100 * temp.groupby(
        ['year']).size() / mixed_df.groupby(['year']).size()
    temp = mixed_df[mixed_df['country'] == True]
    perc_mixed_country_papers_fraction_per_year = 100 * temp.groupby(
        ['year']).size() / mixed_df.groupby(['year']).size()
    temp = mixed_df[mixed_df['continent'] == True]
    perc_mixed_continent_papers_fraction_per_year = 100 * temp.groupby(
        ['year']).size() / mixed_df.groupby(['year']).size()

    top_institutions_by_year = auth_df.groupby(
        ['year'])['institutions'].value_counts()
    top_countries_by_year = auth_df.groupby(['year'])['country'].value_counts()
    top_continents_by_year = auth_df.groupby(['year'
                                              ])['continent'].value_counts()

    years = auth_df['year'].unique()
    perc_authors_diff_country_continent = pd.DataFrame(
        index=years,
        columns=[
            '%_same_country_as_conference', '%_same_continent_as_conference'
        ])
    for y in years:
        same = len(
            auth_df[(auth_df['year'] == y) & (auth_df['country'] == conf_df[
                conf_df['year'] == y]['country'].values[0])].index)
        tot = len(auth_df[(auth_df['year'] == y)].index)
        perc_authors_diff_country_continent.at[
            y, '%_same_country_as_conference'] = 100 * same / tot
        same = len(
            auth_df[(auth_df['year'] == y) & (auth_df['continent'] == conf_df[
                conf_df['year'] == y]['continent'].values[0])].index)
        tot = len(auth_df[(auth_df['year'] == y)].index)
        perc_authors_diff_country_continent.at[
            y, '%_same_continent_as_conference'] = 100 * same / tot

    outtxt += '\nNumber of institutions %d' % number_of_institutions
    outtxt += '\nNumber of countries %d' % number_of_countries
    outtxt += '\nNumber of continents %d' % number_of_continents
    outtxt += '\nPercentage of papers with authors from multiple institutions %f' % perc_mixed_institute_papers_fraction
    outtxt += '\nPercentage of papers with authors from multiple countries %f' % perc_mixed_country_papers_fraction
    outtxt += '\nPercentage of papers with authors from multiple continents %f' % perc_mixed_continent_papers_fraction

    with pd.ExcelWriter('./output/affiliations.xlsx') as writer:
        number_of_institutions_per_year.to_excel(
            writer, sheet_name='Num. of auth. instit. per year', header=False)
        number_of_countries_per_year.to_excel(
            writer, sheet_name='Num. of auth. countr. per year', header=False)
        number_of_continents_per_year.to_excel(
            writer, sheet_name='Num. of auth. contin. per year', header=False)
        top_institutions_by_authors.to_excel(
            writer, sheet_name='Top instit. by num authors', header=False)
        countries_by_authors.to_excel(writer,
                                      sheet_name='Dist. count. by num authors',
                                      header=False)
        continents_by_authors.to_excel(
            writer, sheet_name='Dist. contin. by num authors', header=False)
        top_institutions_by_authorcitations.to_excel(
            writer, sheet_name='Top instit. by auth. cit.', header=False)
        countries_by_authorcitations.to_excel(
            writer, sheet_name='Dist. countr. by auth. cit.', header=False)
        continents_by_authorcitations.to_excel(
            writer, sheet_name='Dist. contin. by auth. cit.', header=False)
        perc_mixed_institute_papers_fraction_per_year.to_excel(
            writer, sheet_name='% paper mixed instit. per year', header=False)
        perc_mixed_country_papers_fraction_per_year.to_excel(
            writer, sheet_name='% paper mixed countr. per year', header=False)
        perc_mixed_continent_papers_fraction_per_year.to_excel(
            writer, sheet_name='% paper mixed contin. per year', header=False)
        perc_authors_diff_country_continent.to_excel(
            writer, sheet_name='% auth. from out conf. per year', header=True)
        top_institutions_by_year.to_excel(writer,
                                          sheet_name='Top instit. by year',
                                          header=False)
        top_countries_by_year.to_excel(writer,
                                       sheet_name='Top count. by year',
                                       header=False)
        top_continents_by_year.to_excel(writer,
                                        sheet_name='Top contin. by year',
                                        header=False)

    with open('./output/affiliations.txt', 'w') as text_file:
        text_file.write(outtxt)

    print('\nGenerated affiliations.txt and affiliations.xlsx in ./output!')
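
The mixed-paper test above boils down to "do the authors span more than one institution/country/continent?"; a toy check of the Counter idiom:

from collections import Counter

institutions = ['KTH', 'KTH', 'McGill']        # hypothetical author affiliations
print(len(Counter(institutions).keys()) > 1)   # True: a mixed-institution paper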
Example #9
def stats_authors(bib_df):

    pa_print.nprint('\nComputing authorship statistics...')

    outtxt = ''

    auth_df = pd.DataFrame(index=range(bib_df['author count'].sum()),
                           columns=[
                               'year', 'name', 'gender1', 'gender2',
                               'citations', 'first', 'mixed'
                           ])
    j = 0
    authfem_df = pd.DataFrame(index=bib_df.index, columns=['year', '1F'])
    for idx, pub in bib_df.iterrows():
        authfem_df.loc[idx, 'year'] = pub['year']
        author_count = pub['author count']
        flag = False
        for i in range(author_count):
            auth_df.loc[j, 'year'] = pub['year']
            auth_df.loc[j, 'name'] = pub['author names'][i][0] + ' ' + pub[
                'author names'][i][1]
            auth_df.loc[j, 'gender1'] = pub['author genders'][i]
            auth_df.loc[j, 'gender2'] = pub['author genders 2'][i]
            if pub['author genders 2'][i] == 'F':
                flag = True
            auth_df.loc[j, 'citations'] = pub['citation count']
            if i == 0:
                auth_df.loc[j, 'first'] = True
            else:
                auth_df.loc[j, 'first'] = False
            j = j + 1

        authfem_df.loc[idx, '1F'] = flag

    # author count and gender
    total_authors = bib_df['author count'].sum()
    total_male_authors = len(auth_df[auth_df['gender2'] == 'M'])
    total_female_authors = len(auth_df[auth_df['gender2'] == 'F'])
    total_neutral_authors = len(auth_df[auth_df['gender2'] == 'N'])

    temp = auth_df.drop_duplicates(subset=['name'])
    unique_authors = len(temp.index)
    unique_male_authors = len(temp[temp['gender2'] == 'M'])
    unique_female_authors = len(temp[temp['gender2'] == 'F'])
    unique_neutral_authors = len(temp[temp['gender2'] == 'N'])

    papers_by_numauthors = bib_df['author count'].value_counts(sort=False)

    average_authors = bib_df['author count'].mean()
    average_authors_per_year = bib_df.groupby(['year'])['author count'].mean()
    total_authors_per_year = bib_df.groupby(['year'])['author count'].sum()

    auth_df_unique = auth_df.drop_duplicates(subset=['name', 'year'])
    unique_authors_per_year = auth_df_unique.groupby(['year'
                                                      ])['name'].nunique()
    authors_by_editions = auth_df_unique['name'].value_counts(sort=True)
    authors_with_editions = authors_by_editions.value_counts(
        sort=False).sort_index()

    temp = auth_df[auth_df['gender2'] == 'M']
    total_male_authors_by_year = temp.groupby(['year']).size()
    temp = auth_df[auth_df['gender2'] == 'F']
    total_female_authors_by_year = temp.groupby(['year']).size()
    temp = auth_df[auth_df['gender2'] == 'N']
    total_neutral_authors_by_year = temp.groupby(['year']).size()
    total_male_percentage_by_year = (
        100 * total_male_authors_by_year /
        (total_male_authors_by_year + total_female_authors_by_year))

    temp = auth_df_unique[auth_df_unique['gender2'] == 'M']
    unique_male_authors_by_year = temp.groupby(['year']).size()
    temp = auth_df_unique[auth_df_unique['gender2'] == 'F']
    unique_female_authors_by_year = temp.groupby(['year']).size()
    temp = auth_df_unique[auth_df_unique['gender2'] == 'N']
    unique_neutral_authors_by_year = temp.groupby(['year']).size()
    unique_male_percentage_by_year = (
        100 * unique_male_authors_by_year /
        (unique_male_authors_by_year + unique_female_authors_by_year))

    papers_by_authors = auth_df['name'].value_counts(sort=True)
    authors_with_numpapers = papers_by_authors.value_counts(
        sort=False).sort_index()

    temp = auth_df_unique[auth_df_unique['first'] == True]
    papers_by_authors_first = temp['name'].value_counts(sort=True)
    authors_with_numpapers_first = papers_by_authors_first.value_counts(
        sort=False).sort_index()

    authors_by_citations = auth_df.groupby(
        ['name'])['citations'].sum().sort_values(ascending=False)
    authors_with_citations = authors_by_citations.value_counts(
        sort=False).sort_index(ascending=True)

    gender_by_citations = auth_df.groupby(['gender2'])['citations'].sum()
    gender_by_citations_per_year = auth_df.groupby(['gender2', 'year'
                                                    ])['citations'].sum()

    temp = authfem_df[authfem_df['1F'] == True]
    one_fem = len(temp)
    one_fem_per_year = 100 * temp.groupby(
        ['year']).size() / authfem_df.groupby(['year']).size()

    years = auth_df['year'].unique()
    auth_returning = pd.DataFrame(index=years)
    auth_returning['first_time'] = ''
    auth_returning['returning_other_years'] = ''
    auth_returning['returning_previous_year'] = ''
    auth_returning['total_unique'] = ''
    poolall = []
    poolprevious = []

    for y in years:
        if y == 2001: # first NIME edition, so no returning authors yet
            auth_returning.at[y, 'returning_previous_year'] = 0
            auth_returning.at[y, 'returning_other_years'] = 0
            auth_returning.at[y, 'first_time'] = auth_df[auth_df['year'] ==
                                                         y]['name'].nunique()
            auth_returning.at[y, 'total_unique'] = auth_df[
                auth_df['year'] == y]['name'].nunique()
            poolprevious = auth_df[auth_df['year'] == y]['name'].unique()
            poolall = poolprevious
        else:
            temp = auth_df[auth_df['year'] == y]['name'].unique()
            returning = np.intersect1d(temp, poolprevious)
            auth_returning.at[y, 'returning_previous_year'] = len(returning)
            returning = np.intersect1d(temp, poolall)
            auth_returning.at[y, 'returning_other_years'] = len(
                returning) - auth_returning.at[y, 'returning_previous_year']
            auth_returning.at[y, 'first_time'] = len(temp) - auth_returning.at[
                y, 'returning_previous_year'] - auth_returning.at[
                    y, 'returning_other_years']
            auth_returning.at[y, 'total_unique'] = auth_df[
                auth_df['year'] == y]['name'].nunique()
            poolprevious = auth_df[auth_df['year'] == y]['name'].unique()
            poolall = np.unique(np.append(poolall, temp))

    # Lotka's law fitting
    xdata = np.array(authors_with_numpapers.index)
    ydata = np.array(authors_with_numpapers.values) / (np.array(
        authors_with_numpapers.values).sum())

    popt, pcov = curve_fit(lotka_law, xdata, ydata)
    residuals = ydata - lotka_law(xdata, *popt)
    ss_res = np.sum(residuals**2)
    ss_tot = np.sum((ydata - np.mean(ydata))**2)
    r_squared = 1 - (ss_res / ss_tot)
    #lotka_df = pd.DataFrame(data={'xdata': xdata, 'freq': ydata, 'fit': lotka_law(xdata, *popt)})

    outtxt += '\nTotal authors %d - males %d - females %d - unknown %d' % (
        total_authors, total_male_authors, total_female_authors,
        total_neutral_authors)
    outtxt += '\nUnique authors %d - males %d - females %d - unknown %d' % (
        unique_authors, unique_male_authors, unique_female_authors,
        unique_neutral_authors)
    outtxt += '\nPapers with at least one female author %d' % one_fem
    outtxt += '\nAverage authors per paper %f' % average_authors
    outtxt += "\nLotka's law fitting n %f - C %f - R^2 %f" % (
        popt[0], popt[1], r_squared)

    with pd.ExcelWriter('./output/authors.xlsx') as writer:
        total_authors_per_year.to_excel(writer,
                                        sheet_name='Total authors per year',
                                        header=False)
        unique_authors_per_year.to_excel(writer,
                                         sheet_name='Unique authors per year',
                                         header=False)
        auth_returning.to_excel(writer,
                                sheet_name='Returning authors',
                                header=True)
        average_authors_per_year.to_excel(
            writer, sheet_name='Avg. auth. per paper per year', header=False)
        total_male_authors_by_year.to_excel(
            writer, sheet_name='Total male auth. per year', header=False)
        total_female_authors_by_year.to_excel(
            writer, sheet_name='Total female auth. per year', header=False)
        total_neutral_authors_by_year.to_excel(
            writer, sheet_name='Total unknown auth. per year', header=False)
        total_male_percentage_by_year.to_excel(
            writer, sheet_name='Total male auth. % per year', header=False)
        unique_male_authors_by_year.to_excel(
            writer, sheet_name='Unique male auth. per year', header=False)
        unique_female_authors_by_year.to_excel(
            writer, sheet_name='Unique female auth. per year', header=False)
        unique_neutral_authors_by_year.to_excel(
            writer, sheet_name='Unique unknown auth. per year', header=False)
        unique_male_percentage_by_year.to_excel(
            writer, sheet_name='Unique male % per year', header=False)
        papers_by_numauthors.to_excel(
            writer, sheet_name='Distr. papers by num authors', header=False)
        papers_by_authors.to_excel(writer,
                                   sheet_name='Papers by authors',
                                   header=False)
        authors_with_numpapers.to_excel(
            writer, sheet_name='Distr. authors with #papers', header=False)
        papers_by_authors_first.to_excel(writer,
                                         sheet_name='Papers by authors first',
                                         header=False)
        authors_with_numpapers_first.to_excel(
            writer, sheet_name='Authors first with #papers', header=False)
        authors_by_editions.to_excel(writer,
                                     sheet_name='Authors at #editions',
                                     header=False)
        authors_with_editions.to_excel(writer,
                                       sheet_name='Distr. auth. at #editions',
                                       header=False)
        authors_by_citations.to_excel(writer,
                                      sheet_name='Authors by citations',
                                      header=False)
        authors_with_citations.to_excel(
            writer, sheet_name='Distr. auth. with #citations', header=False)
        gender_by_citations.to_excel(writer,
                                     sheet_name='Cit. males-females',
                                     header=False)
        gender_by_citations_per_year.to_excel(
            writer, sheet_name='Cit. males-females per year', header=False)
        one_fem_per_year.to_excel(writer,
                                  sheet_name='Papers with >=1 female per year',
                                  header=False)

    with open('./output/authors.txt', 'w') as text_file:
        text_file.write(outtxt)

    print('\nGenerated authors.txt and authors.xlsx in ./output!')
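
curve_fit needs the lotka_law model function, which is defined elsewhere in the repo. Lotka's law says the number of authors with x papers falls off as a power of x, so a plausible sketch consistent with the "n %f - C %f" output above is:

import numpy as np

def lotka_law(x, n, c):
    # frequency of authors with x papers ~ C / x^n
    return c / np.power(x, n)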
Example #10
def stats_papers(bib_df):

    pa_print.nprint('\nComputing papers statistics...')

    outtxt = ''
    # papers in total and per year
    papers_total = len(bib_df.index)
    papers_per_year = bib_df['year'].value_counts(sort=False)
    outtxt += '\nTotal papers %d' % papers_total

    # growth of NIME papers corpus per year
    papers_per_year_cumulative = bib_df['year'].value_counts(
        sort=False).cumsum()

    # full-short-other papers
    pre21_bib_df = bib_df.loc[(bib_df['year'] <= 2020)]
    post21_bib_df = bib_df.loc[(bib_df['year'] >= 2021)]

    temp = pre21_bib_df.loc[(pre21_bib_df['page count'] > 4)]
    full_papers_per_year_pre21 = temp['year'].value_counts(sort=False)
    temp = post21_bib_df.loc[(post21_bib_df['word count'] > 3000)]
    full_papers_per_year_post21 = temp['year'].value_counts(sort=False)
    full_papers_per_year = pd.concat(
        [full_papers_per_year_pre21, full_papers_per_year_post21], axis=0)
    full_papers_total = full_papers_per_year.sum()

    temp = pre21_bib_df.loc[(pre21_bib_df['page count'] > 2)
                            & (pre21_bib_df['page count'] <= 4)]
    short_papers_per_year_pre21 = temp['year'].value_counts(sort=False)
    temp = post21_bib_df.loc[(post21_bib_df['word count'] > 1500)
                             & (post21_bib_df['word count'] <= 3000)]
    short_papers_per_year_post21 = temp['year'].value_counts(sort=False)
    short_papers_per_year = pd.concat(
        [short_papers_per_year_pre21, short_papers_per_year_post21], axis=0)
    short_papers_total = short_papers_per_year.sum()

    temp = pre21_bib_df.loc[(pre21_bib_df['page count'] <= 2)]
    other_papers_per_year_pre21 = temp['year'].value_counts(sort=False)
    temp = post21_bib_df.loc[(post21_bib_df['word count'] <= 1500)]
    other_papers_per_year_post21 = temp['year'].value_counts(sort=False)
    other_papers_per_year = pd.concat(
        [other_papers_per_year_pre21, other_papers_per_year_post21], axis=0)
    other_papers_total = other_papers_per_year.sum()

    outtxt += '\nTotal Full Papers %d' % full_papers_total
    outtxt += '\nTotal short papers %d' % short_papers_total
    outtxt += '\nTotal Other Papers %d' % other_papers_total

    # pages
    papers_by_pages_pre21 = pre21_bib_df['page count'].value_counts(sort=False)
    average_paper_length_pages_pre21 = pre21_bib_df['page count'].mean()
    max_paper_length_pages_pre21 = pre21_bib_df['page count'].max()
    pages_per_year_average_pre21 = pre21_bib_df.groupby(
        ['year'])['page count'].mean()
    pages_per_year_total_pre21 = pre21_bib_df.groupby(['year'
                                                       ])['page count'].sum()
    longest_papers_pages_pre21 = pre21_bib_df.loc[
        pre21_bib_df['page count'] == max_paper_length_pages_pre21]['title']
    outtxt += '\nAverage papers length pages pre 2021 %f' % average_paper_length_pages_pre21
    outtxt += '\nMax papers length pages pre 2021 %d' % max_paper_length_pages_pre21

    # word count
    words_total = bib_df['word count'].sum()
    words_average = bib_df['word count'].mean()

    pre21 = pre21_bib_df.loc[(pre21_bib_df['page count'] > 4)]
    post21 = post21_bib_df.loc[(post21_bib_df['word count'] > 3000)]
    temp = pd.concat([pre21, post21], axis=0)
    words_average_full = temp['word count'].mean()

    pre21 = pre21_bib_df.loc[(pre21_bib_df['page count'] > 2)
                             & (pre21_bib_df['page count'] <= 4)]
    post21 = post21_bib_df.loc[(post21_bib_df['word count'] > 1500)
                               & (post21_bib_df['word count'] <= 3000)]
    temp = pd.concat([pre21, post21], axis=0)
    words_average_short = temp['word count'].mean()

    pre21 = pre21_bib_df.loc[(pre21_bib_df['page count'] <= 2)]
    post21 = post21_bib_df.loc[(post21_bib_df['word count'] <= 1500)]
    temp = pd.concat([pre21, post21], axis=0)
    words_average_other = temp['word count'].mean()

    temp = pre21_bib_df.loc[pre21_bib_df['page count'] == 6]
    words_average_sixpages_pre21 = temp['word count'].mean()

    temp = pre21_bib_df.loc[pre21_bib_df['page count'] == 4]
    words_average_fourpages_pre21 = temp['word count'].mean()

    temp = pre21_bib_df.loc[pre21_bib_df['page count'] == 2]
    words_average_twopages_pre21 = temp['word count'].mean()

    words_per_year_total = bib_df.groupby(['year'])['word count'].sum()
    words_per_year_average = bib_df.groupby(['year'])['word count'].mean()

    max_paper_words = bib_df['word count'].max()
    longest_papers_words = bib_df.loc[bib_df['word count'] ==
                                      max_paper_words]['title']

    counts, bins = np.histogram(bib_df['word count'], bins=50)
    center = (bins[:-1] + bins[1:]) / 2
    papers_by_word_count = pd.DataFrame(counts,
                                        index=center,
                                        columns=['count'])

    outtxt += '\nTotal word count %d' % words_total
    outtxt += '\nAverage word count %f' % words_average
    outtxt += '\nAverage word count full papers %f' % words_average_full
    outtxt += '\nAverage word count short papers %f' % words_average_short
    outtxt += '\nAverage word count other papers %f' % words_average_other
    outtxt += '\nAverage word count 6 pages pre 2021 %f' % words_average_sixpages_pre21
    outtxt += '\nAverage word count 4 pages pre 2021 %f' % words_average_fourpages_pre21
    outtxt += '\nAverage word count 2 pages pre 2021 %f' % words_average_twopages_pre21
    outtxt += '\nMax papers words %d' % max_paper_words

    # citations
    papers_by_citations = bib_df['citation count'].value_counts(
        sort=False).sort_index()
    citations_total = bib_df['citation count'].sum()
    citations_per_year = bib_df.groupby(['year'])['citation count'].sum()
    citations_per_year_norm_by_numpaper = bib_df.groupby(
        ['year'])['citation count'].mean()
    citations_per_year_norm_by_agepapers = bib_df.groupby(
        ['year'])['yearly citations'].mean()

    temp = bib_df.loc[bib_df['citation count'] >= 1]
    papers_at_least_1_citation = len(temp.index)

    temp = bib_df.loc[bib_df['citation count'] >= 10]
    papers_more_10_citations = len(temp.index)

    citations_50perc = papers_perc_citations(bib_df, 0.5)
    citations_90perc = papers_perc_citations(bib_df, 0.9)

    citations_50perc_per_year = papers_perc_citations_year(bib_df, 0.5)
    citations_90perc_per_year = papers_perc_citations_year(bib_df, 0.9)

    temp = bib_df.sort_values(by=['citation count'], ascending=False)
    temp = temp.head(20)
    top_papers_by_citations = temp[[
        'citation count', 'title', 'year', 'NIME reader'
    ]]

    temp = bib_df.sort_values(by=['yearly citations'], ascending=False)
    temp = temp.head(20)
    top_papers_by_yearly_citations = temp[[
        'yearly citations', 'title', 'year', 'NIME reader'
    ]]

    most_cited_paper_by_pub_year = papers_top_citations_year(bib_df)

    temp = bib_df.loc[bib_df['citation count'].isnull()]
    not_cited_pages = temp['page count'].value_counts(sort=True)

    outtxt += '\nTotal citations %d' % citations_total
    outtxt += '\nPapers with at least 1 citation %d equivalent to %f %%' % (
        papers_at_least_1_citation,
        100 * papers_at_least_1_citation / papers_total)
    outtxt += '\nPapers with 10 or more citations %d equivalent to %f %%' % (
        papers_more_10_citations,
        100 * papers_more_10_citations / papers_total)
    outtxt += '\n50%% citations are from %d papers representing %f %% of the total' % (
        citations_50perc[0], 100 * citations_50perc[1])
    outtxt += '\n90%% citations are from %d papers representing %f %% of the total' % (
        citations_90perc[0], 100 * citations_90perc[1])

    with pd.ExcelWriter('./output/papers.xlsx') as writer:
        papers_per_year.to_excel(writer,
                                 sheet_name='Papers per year',
                                 header=False)
        papers_per_year_cumulative.to_excel(
            writer, sheet_name='Cumulative papers per year', header=False)
        full_papers_per_year.to_excel(writer,
                                      sheet_name='Full papers per year',
                                      header=False)
        short_papers_per_year.to_excel(writer,
                                       sheet_name='Short papers per year',
                                       header=False)
        other_papers_per_year.to_excel(writer,
                                       sheet_name='Other papers per year',
                                       header=False)
        longest_papers_pages_pre21.to_excel(
            writer, sheet_name='Longest papers in pages pre 21', header=False)
        pages_per_year_total_pre21.to_excel(
            writer, sheet_name='Pages total per year pre 21', header=False)
        pages_per_year_average_pre21.to_excel(
            writer, sheet_name='Pages average per year pre 21', header=False)
        papers_by_pages_pre21.to_excel(
            writer, sheet_name='Papers by page count pre 21', header=False)
        longest_papers_words.to_excel(writer,
                                      sheet_name='Longest papers in words',
                                      header=False)
        words_per_year_total.to_excel(writer,
                                      sheet_name='Words total per year',
                                      header=False)
        words_per_year_average.to_excel(writer,
                                        sheet_name='Words average per year',
                                        header=False)
        papers_by_word_count.to_excel(writer,
                                      sheet_name='Papers by word count',
                                      header=False)
        citations_per_year.to_excel(writer,
                                    sheet_name='Cit. per year',
                                    header=False)
        citations_per_year_norm_by_numpaper.to_excel(
            writer, sheet_name='Cit. pr yr. norm.by #papers', header=False)
        citations_per_year_norm_by_agepapers.to_excel(
            writer, sheet_name='Cit. pr yr. norm.by #papers&age', header=False)
        citations_50perc_per_year.to_excel(
            writer, sheet_name='50% cit. from papers per year', header=True)
        citations_90perc_per_year.to_excel(
            writer, sheet_name='90% cit. from papers per year', header=True)
        top_papers_by_citations.to_excel(writer,
                                         sheet_name='Top papers by cit.',
                                         header=True)
        top_papers_by_yearly_citations.to_excel(
            writer, sheet_name='Top papers by yearly cit.', header=True)
        most_cited_paper_by_pub_year.to_excel(
            writer, sheet_name='Most cited paper by pub. year', header=True)
        papers_by_citations.to_excel(writer,
                                     sheet_name='Papers by cit.',
                                     header=False)
        not_cited_pages.to_excel(writer,
                                 sheet_name='Not cited papers by page length',
                                 header=False)

    with open('./output/papers.txt', 'w') as text_file:
        text_file.write(outtxt)

    print('\nGenerated papers.txt and papers.xlsx in ./output!')
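
papers_perc_citations is defined elsewhere; judging from how its result is formatted above (a paper count and a fraction of all papers), a hypothetical reconstruction could look like:

import pandas as pd

def papers_perc_citations(bib_df, perc):
    # Hypothetical sketch: how many top-cited papers account for `perc`
    # of all citations, and that count as a fraction of all papers
    cits = bib_df['citation count'].fillna(0).sort_values(ascending=False)
    cum = cits.cumsum() / cits.sum()
    n = int((cum < perc).sum()) + 1
    return n, n / len(cits)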