Code Example #1
File: Main.py Project: korotaevaVika/ruKeywords
def ru_words_test():
    lang = 'russian'

    if lang == 'russian':
        reader = Reader('elsevier journal.pdf',
                        generateWordList=True)  #, language='russian')
        reader.loadFile()

        words, lines, articles = reader.parseDocPages(19, 21)  # alternative ranges: (14, 17), (0, 50), (2160, 2190)

        # print the words of each block; for block 6, break down by sentence
        for i in range(11):
            print(i)
            if i != 6:
                print(' '.join(
                    [x['text'] for x in words if x['block_count'] == i]))
            else:
                for j in range(15):
                    print(j, ' sentence')
                    print(' '.join([
                        x['text'] for x in words
                        if x['block_count'] == i and x['sentence_count'] == j
                    ]))

        print("--=-")
    else:
        df_source = loadArticlesInEnglish()
Code Example #2
def list_w_p(path, mass):
    '''Return a DataFrame of work-power data calculated from
       the .txt files within the path.
       Argument:
       mass: mass or surface area or volume of the sample'''

    instances = []

    for s in os.listdir(path):
        # find the .txt files
        if '.txt' in s:
            f_path = os.path.join(path, s)
            inst = rd.GV(filename=f_path, mass=mass)
            instances.append(inst)

    # concatenate one work-power result per file into a single DataFrame
    # (DataFrame.append was removed in pandas 2.0, so pd.concat is used)
    w_p = pd.concat([inst.work_power() for inst in instances])

    return w_p.reset_index(drop=True)
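A minimal usage sketch (the path and mass value are hypothetical; rd.GV and its work_power() method are this project's own reader, assumed to return one row of work-power data per file):

# Hypothetical usage: 'data/gv_runs' is an assumed directory of .txt exports
# and mass=0.0025 an assumed sample mass.
w_p = list_w_p('data/gv_runs', mass=0.0025)
print(w_p.head())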
Code Example #3
File: Main.py Project: korotaevaVika/ruKeywords
def lemmaEng():
    lang = 'english'
    #lang = 'russian'
    partMatchesCounted = False  # if True, custom partial-match metrics; otherwise rough precision/recall
    num = 8

    if lang == 'russian':
        reader = Reader('elsevier journal.pdf')  #, language='russian')
        reader.loadFile()

        words, lines, articles = reader.parseDocPages()  #2160, 2190)#0, 50)
        print(len(articles))

        df = reader.getArticlesDataframe()

    else:
        df = loadArticlesInEnglish()

    df['text'] = ''

    processor = Preprocessor(stopwordsList=None, lang=lang)

    if (lang == 'russian'):
        df['text'] = df['Source Text']
        #for index, row in tqdm(df.iterrows(), total=df.shape[0]):
        #	df.at[index, 'text'] = processor.preprocess_text(row['Source Text'],
        #	useLemmas = False)
    else:
        df['text'] = df['Source Text']
        subset = df.iloc[0:1, :].copy()  # copy to avoid SettingWithCopyWarning below

        for index, row in tqdm(subset.iterrows(), total=subset.shape[0]):
            text_lemma_sw = processor.preprocess_text(row['Source Text'],
                                                      removeStopWords=True,
                                                      useLemmas=True)
            text_lemma = processor.preprocess_text(row['Source Text'],
                                                   removeStopWords=False,
                                                   useLemmas=True)
            subset.at[index, 'text_lemma_sw'] = text_lemma_sw
            subset.at[index, 'text_lemma'] = text_lemma
            source = row['Source Text']
Code Example #4
def object_generator(path, mass):
    '''Yield one rd.CV object per .txt file within path (lazy version).'''
    for s in os.listdir(path):
        # find the .txt files
        if '.txt' in s:
            f_path = os.path.join(path, s)
            yield rd.CV(filename=f_path, mass=mass)
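The generator yields rd.CV objects lazily, so a large directory is processed one file at a time instead of being collected into a list first. A hypothetical usage sketch (the path is an assumption; scan_rate is the attribute multi_cv below reads from these objects):

for cv_obj in object_generator('data/cv_runs', mass=0.0025):
    print(cv_obj.scan_rate)  # assumed attribute, as used in multi_cv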
Code Example #5
def multi_cv(
        ax,
        path,
        title=None,
        mass=1.0,
        color_set=('black', 'blue', 'green', 'red', 'pink', 'brown', 'yellow'),
        xlabel=None,
        ylabel=None,
        filter_on=False,
        smooth_range=None,
        legend_font=12,
        **kwargs):
    '''Draw multiple CV curves on a single Axes.
       Arguments:
       ax
       path: the path of the directory containing .txt files
       title
       mass: mass or surface area or volume of the sample'''

    gp.Update_axe(ax, title=title, **kwargs)
    # the list for Dataframe objects
    cv = []
    for s in os.listdir(path):
        # find the .txt files
        if '.txt' in s:
            f_path = os.path.join(path, s)
            obj = rd.CV(filename=f_path, mass=mass)

            cv.append(obj)
    # draw one line per CV object, smoothing first if requested
    for counter, obj in enumerate(cv):
        if filter_on and smooth_range is not None:
            gp.MultiLine(smoother_cv(obj, smooth_range),
                         ax,
                         label=obj.scan_rate,
                         color=color_set[counter])
        else:
            gp.MultiLine(obj,
                         ax,
                         label=obj.scan_rate,
                         color=color_set[counter])

    # default axis labels come from the first file's column names
    if xlabel is None:
        ax.set_xlabel(cv[0].columns[0])
    else:
        ax.set_xlabel(xlabel)

    if ylabel is None:
        ax.set_ylabel(cv[0].columns[1])
    else:
        ax.set_ylabel(ylabel)

    gp.Auto_legend(ax,
                   bbox_to_anchor=(0, 1.03),
                   loc='upper left',
                   fontsize=legend_font)
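A hypothetical usage sketch (path, mass, and smoothing values are assumptions; gp, rd, and smoother_cv are this project's helpers, not a public API):

import matplotlib.pyplot as plt

fig, ax = plt.subplots()
multi_cv(ax, 'data/cv_runs', title='CV at several scan rates', mass=0.0025,
         filter_on=True, smooth_range=5)
fig.savefig('cv_curves.png', bbox_inches='tight')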
Code Example #6
def multi_c_d(
        ax,
        path,
        title=None,
        mass=1.0,
        unit=' A/g',
        color_set=('black', 'blue', 'green', 'red', 'pink', 'brown', 'yellow'),
        xlabel=None,
        ylabel=None,
        legend_font=12,
        b_to_a=(1, 0.75),
        **kwargs):
    '''Draw multiple charge/discharge curves on a single Axes.
       Arguments:
       ax
       path: the path of the directory containing .txt files
       title
       mass: mass or surface area or volume of the sample'''

    gp.Update_axe(ax, title=title, **kwargs)

    # the list for GV objects
    instances = []

    for s in os.listdir(path):
        # find the .txt files
        if '.txt' in s:
            f_path = os.path.join(path, s)
            inst = rd.GV(filename=f_path, mass=mass)
            instances.append(inst)

    # collect legend handles from the charge curves only, so each current
    # appears once in the legend even though two lines are drawn per file
    handles = []
    for counter, inst in enumerate(instances):
        l_char = gp.MultiLine(inst.charge_curve(),
                              ax,
                              label=str(inst.current) + unit,
                              color=color_set[counter])
        gp.MultiLine(inst.discharge_curve(),
                     ax,
                     label=str(inst.current) + unit,
                     color=color_set[counter])
        # gp.MultiLine returns a list of line objects
        handles += l_char

    # default axis labels come from the first file's column names
    if xlabel is None:
        ax.set_xlabel(instances[0].columns[0])
    else:
        ax.set_xlabel(xlabel)

    if ylabel is None:
        ax.set_ylabel(instances[0].columns[1])
    else:
        ax.set_ylabel(ylabel)

    ax.set_xlim(left=0)
    ax.set_ylim(top=0.8, bottom=0)

    ax.legend(handles=handles,
              bbox_to_anchor=b_to_a,
              loc='upper right',
              frameon=False,
              fontsize=legend_font)
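The handles trick keeps the legend readable: only the charge lines contribute handles, so each current is listed once even though two lines are drawn per file. A self-contained sketch of the same pattern in plain matplotlib (synthetic triangle-shaped data, not this project's readers):

import numpy as np
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
handles = []
for current in (0.5, 1.0, 2.0):
    t = np.linspace(0, 10 / current, 50)
    # label only the charge line; reuse its color for the discharge line
    line_charge, = ax.plot(t, 0.08 * current * t, label=str(current) + ' A/g')
    ax.plot(t, 0.8 - 0.08 * current * t, color=line_charge.get_color())
    handles.append(line_charge)
ax.legend(handles=handles, loc='upper right', frameon=False)
plt.show()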
Code Example #7
File: Main.py Project: korotaevaVika/ruKeywords
def test_ru_graph_keys_words_numbers():
    lang = 'russian'
    reader = Reader('elsevier journal.pdf')  #, language='russian')
    reader.loadFile()

    words, lines, articles = reader.parseDocPages()  # optionally (start, end), e.g. (0, 50) or (2160, 2190)
    print(len(articles))

    df_source = reader.getArticlesDataframe()
    df = df_source.copy()

    df['text'] = ''
    df['noun_phrases_num'] = 0
    processor = Preprocessor(stopwordsList=None, lang=lang)
    sw = processor.stopwords

    morph = pymorphy2.MorphAnalyzer()
    for index, row in tqdm(df.iterrows(), total=df.shape[0]):
        text = processor.preprocess_text(row['Source Text'],
                                         removeStopWords=True,
                                         useLemmas=False)
        df.at[index, 'text'] = text
        pos = [morph.parse(w)[0].tag.POS for w in re.findall(r"[\w']+", text)]
        count = Counter(pos)
        # NOUN/ADJF/PRTF: pymorphy2 tags for noun, full adjective, full participle
        df.at[index, 'noun_phrases_num'] = (
            count['NOUN'] + count['ADJF'] + count['PRTF'])

    df['keys_phrases_num'] = df.apply(
        lambda row: len(row['Keywords'].split(',')), axis=1)
    df['keys_words_num'] = df.apply(lambda row: len(
        (re.findall(r"[\w']+", row['Keywords']))),
                                    axis=1)
    df['words_num_sw_incl'] = df.apply(lambda row: len(
        (re.findall(r"[\w']+", row['Source Text']))),
                                       axis=1)
    df['words_num'] = df.apply(lambda row: len(
        (re.findall(r"[\w']+", row['text']))),
                               axis=1)

    stats = df[[
        'keys_phrases_num', 'keys_words_num', 'words_num', 'words_num_sw_incl',
        'noun_phrases_num'
    ]]
    stats.to_excel("noun_phrases_num.xlsx")
    #stats.to_excel("keys_words_number_stats.xlsx")

    lst_phrases = stats['keys_phrases_num'].tolist()
    lst_keys = stats['keys_words_num'].tolist()
    lst_words = stats['words_num'].tolist()
    lst_words_num_sw_incl = stats['words_num_sw_incl'].tolist()
    lst_noun_phrases_num = stats['noun_phrases_num'].tolist()

    x = list(range(1, len(lst_words) + 1))

    plt.figure()
    plt.plot(x, lst_phrases)
    # Show/save figure as desired.
    title = 'Количество ключевых фраз'  # "Number of key phrases"
    plt.title(title)
    # Add axis labels:
    plt.xlabel("Номер статьи")  # "Article number"
    plt.ylabel('Количество ключевых фраз')
    plt.savefig(title + " lst_phrases" + ".png", bbox_inches='tight')
    plt.show()

    plt.figure()
    plt.plot(x, lst_keys)
    # Show/save figure as desired.
    title = 'Количество ключевых слов'  # "Number of keywords"
    plt.title(title)
    # Add axis labels:
    plt.xlabel("Номер статьи")  # "Article number"
    plt.ylabel('Количество ключевых слов')
    plt.savefig(title + " lst_keys" + ".png", bbox_inches='tight')
    plt.show()

    plt.figure()
    plt.plot(x, lst_words_num_sw_incl)
    # Show/save figure as desired.
    title = 'Количество слов в текстах'  # "Number of words in the texts"
    plt.title(title)
    # Add axis labels:
    plt.xlabel("Номер статьи")  # "Article number"
    plt.ylabel('Количество слов в тексте статьи')  # "Words in the article text"
    plt.savefig(title + " words_num_sw_incl" + ".png", bbox_inches='tight')
    plt.show()

    plt.figure()
    plt.plot(x, lst_words)
    # Show/save figure as desired.
    title = 'Количество слов в текстах (без стопслов)'  # "Number of words in the texts (without stopwords)"
    plt.title(title)
    # Add axis labels:
    plt.xlabel("Номер статьи")  # "Article number"
    plt.ylabel('Количество слов в тексте статьи')
    plt.savefig(title + " lst_words" + ".png", bbox_inches='tight')
    plt.show()

    plt.figure()
    plt.plot(x, lst_noun_phrases_num)
    # Show/save figure as desired.
    title = 'Количество существительных, прилагательных и причастий в текстах'  # "Number of nouns, adjectives and participles in the texts"
    plt.title(title)
    # Add axis labels:
    plt.xlabel("Номер статьи")  # "Article number"
    plt.ylabel('Количество слов')  # "Number of words"
    plt.savefig(title + " lst_noun_phrases_num" + ".png", bbox_inches='tight')
    plt.show()

    plt.figure()
    plt.plot(x, lst_words_num_sw_incl, 'b', label='С учетом стопслов')  # "Including stopwords"
    plt.plot(x, lst_words, 'g', label='Без учета стопслов')  # "Excluding stopwords"
    plt.plot(x,
             lst_noun_phrases_num,
             'y',
             label='Существительные, прилагательные и причастия')  # "Nouns, adjectives and participles"
    # Show/save figure as desired.
    title = 'Сравнение количества слов в текстах'  # "Comparison of word counts in the texts"
    plt.title(title)
    # Add axis labels:
    plt.xlabel("Номер статьи")  # "Article number"
    plt.ylabel('Количество слов')
    plt.savefig(title + " compare_words_num_on_sw_pos" + ".png",
                bbox_inches='tight')
    plt.show()
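The POS-counting step in isolation, as a self-contained sketch (NOUN, ADJF, and PRTF are pymorphy2's OpenCorpora tags for nouns, full adjectives, and full participles; the sample sentence is made up):

import re
from collections import Counter
import pymorphy2

morph = pymorphy2.MorphAnalyzer()
text = "Красивая мама мыла новую раму"
# take the most probable parse of each token and keep its part of speech
pos = [morph.parse(w)[0].tag.POS for w in re.findall(r"[\w']+", text)]
count = Counter(pos)
print(count['NOUN'] + count['ADJF'] + count['PRTF'])  # nouns + adjectives + participles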
Code Example #8
File: Main.py Project: korotaevaVika/ruKeywords
def SimpleNN():
    warnings.filterwarnings("ignore")

    lang = 'russian'

    nltk.download("stopwords")
    lst_stopwords = stopwords.words("russian")
    alphabet = "АаБбВвГгДдЕеЁёЖжЗзИиЙйКкЛлМмНнОоПпСсТтУуФфХхЦцЧчШшЩщЪъЫыЬьЭэЮюЯя"
    alphabet += "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
    lst_stopwords.extend(list(alphabet))

    if lang == 'russian':
        reader = Reader(
            'elsevier journal.pdf',
            generateWordList=True,
            additional_stopwords=lst_stopwords)  #, language='russian')
        reader.loadFile()

        words, lines, articles = reader.parseDocPages(4, 2190)  #19, 400) #1)

        df = pd.DataFrame.from_dict(words)

        cat_features = [
            'size',
            'flags',
            'font',  #'color', #'otherSigns',
            'morph_pos',
            'morph_animacy',
            'morph_aspect',
            'morph_case',
            'morph_gender',
            'morph_involvement',
            'morph_mood',
            'morph_number',
            'morph_person',
            'morph_tense',
            'morph_transitivity',
            'morph_voice'
        ]

        all_columns = list(df.columns)
        df = pd.concat([
            pd.get_dummies(df[col], prefix=col)
            if col in cat_features and col != 'otherSigns' else df[col]
            for col in all_columns
        ],
                       axis=1)

        # match literal punctuation: regex=False, because '.', '(', ')', '[', ']'
        # are regex metacharacters ('.' alone would match any character)
        values = [',', '.', ')', '(', '[', ']']

        for value in values:
            # numeric 0/1 flags, so select_dtypes(include=numerics) keeps them
            df['otherSigns' + '_' + value] = np.where(
                df['otherSigns'].str.contains(value, regex=False), 1, 0)
        df = df.drop(['otherSigns'], axis=1)

        print('all columns')
        for col in all_columns:
            print(col)
        print()

        print('new columns')
        for col in df.columns:
            print(col)

        df.head()

        featuresToRemove = [
            'text',
            'morph_normalform',
            'morph_lexeme',
            'span_count',
            'line_count',
            'block_count',
            #'sentence_count',
            'page_num',
            'article_num',
            'color'
        ]
        #featuresToRemove = []
        df = df.drop(featuresToRemove, axis=1)

        # convert all columns of DataFrame
        #df = df.apply(pd.to_numeric)

        print("df.dtypes")
        print(df.dtypes)

        df.to_csv('all_words_features.csv', index=False)

        numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
        newdf = df.select_dtypes(include=numerics)

        print("newdf.columns")
        print(newdf.columns)

        #df_y = newdf.pop('is_stopword')

        keywords = newdf[newdf['is_keyword'] == 1]
        print('len of keywords')
        print(keywords.shape)

        keywords.to_csv('all_keywords_features.csv', index=False)

        try:
            notkeywords = newdf[newdf['is_keyword'] == 0].sample(
                n=len(keywords), replace=False)
        except ValueError:
            notkeywords = newdf[newdf['is_keyword'] == 0].sample(
                n=len(keywords), replace=True)
        print('len of notkeywords')
        print(notkeywords.shape)

        notkeywords.to_csv('all_notkeywords_features.csv', index=False)

        newdf = pd.concat([notkeywords, keywords])
        newdf = newdf.sample(frac=1)

        df_Y = newdf.pop('is_keyword')
        Y = df_Y.values
        X = newdf.values.astype(float)
        print('X.shape')
        print(X.shape)

        # evaluate model with standardized dataset
        estimator = KerasClassifier(build_fn=create_baseline,
                                    input_dim=len(newdf.columns),
                                    epochs=100,
                                    batch_size=5,
                                    verbose=0)
        kfold = StratifiedKFold(n_splits=10, shuffle=True)
        results = cross_val_score(estimator, X, Y, cv=kfold, scoring='f1')
        print("Baseline: %.2f%% (%.2f%%)" %
              (results.mean() * 100, results.std() * 100))
        #Baseline: 86.09% (4.30%)
        #Baseline: 87.93% (1.14%)
        #Baseline: 71.70% (3.28%)
        #Baseline: 74.48% (1.10%) а1
        print('bye-bye')
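A self-contained sketch of the two preprocessing ideas above: one-hot encoding the categorical features and balancing the classes by sampling as many non-keywords as there are keywords (synthetic toy data, not the journal corpus):

import pandas as pd

df = pd.DataFrame({'font': ['Arial', 'Times', 'Arial', 'Times'],
                   'size': [10, 12, 10, 14],
                   'is_keyword': [1, 0, 0, 0]})
df = pd.get_dummies(df, columns=['font'])  # one-hot encode the font column

keywords = df[df['is_keyword'] == 1]
notkeywords = df[df['is_keyword'] == 0].sample(n=len(keywords), replace=False)
balanced = pd.concat([keywords, notkeywords]).sample(frac=1)  # shuffle rows
print(balanced)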
Code Example #9
File: Main.py Project: korotaevaVika/ruKeywords
def main():
    #lang = 'english'
    lang = 'russian'
    partMatchesCounted = False  # if True, custom partial-match metrics; otherwise rough precision/recall
    num = 8

    if lang == 'russian':
        reader = Reader('elsevier journal.pdf')  #, language='russian')
        reader.loadFile()
        #pages = reader.readPages();

        ##pages = reader.readPages(0, None, 'НАПРАВЛЕНИЕ 1', 'НАПРАВЛЕНИЕ 2',
        ##caseSensitiveSearch=True)
        ##pages = reader.readPages(0, None, 'НАПРАВЛЕНИЕ 1', 'НАПРАВЛЕНИЕ 2',
        ##caseSensitiveSearch=False)
        ##pages = reader.readPages(start=2, end=10, startString=None, endString=None,
        ##debug=True)
        ##pages = reader.readPages(start=4, end=4, startString=None, endString=None,
        ##debug=True)
        #print(len(pages))

        words, lines, articles = reader.parseDocPages()  #2160, 2190)#0, 50)
        print(len(articles))

        df = reader.getArticlesDataframe()

    else:
        df = loadArticlesInEnglish()

    df['text'] = ''

    processor = Preprocessor(stopwordsList=None, lang=lang)

    if (lang == 'russian'):
        df['text'] = df['Source Text']
        #for index, row in tqdm(df.iterrows(), total=df.shape[0]):
        #	df.at[index, 'text'] = processor.preprocess_text(row['Source Text'],
        #	useLemmas = False)
    else:
        df['text'] = df['Source Text']
        #for index, row in tqdm(df.iterrows(), total=df.shape[0]):
        #df.at[index, 'text'] = processor.preprocess_text(row['Source Text'],
        #removeStopWords=False, useLemmas=False)

    print(df.head())
    rakeExtractor.extractKeywords(df,
                                  num=num,
                                  metricsCount=True,
                                  partMatchesCounted=partMatchesCounted)

    print(df.head())
    #print(tabulate(df, headers='keys', tablefmt='psql'))
    kw = df['Keywords'].values
    #print(kw)
    #print(df.Keywords)

    tfidfBlobExtractor = TfIdfBlobExtractor(processor.getStopwords())
    tfidfBlobExtractor.extractKeywords(df,
                                       num=num,
                                       metricsCount=True,
                                       partMatchesCounted=partMatchesCounted)

    #print(df.head())

    textRankExtractor = TextRankExtractor(processor.getStopwords(),
                                          language=lang)
    textRankExtractor.extractKeywords(
        df, num=num, metricsCount=True,
        partMatchesCounted=partMatchesCounted)  #False)

    #print(df.head())

    #print(tabulate(df.head(), headers='keys', tablefmt='psql'))

    x = []
    y = {
        'rake': {
            'precision': [],
            'recall': [],
            'f1': []
        },
        'textrank': {
            'precision': [],
            'recall': [],
            'f1': []
        },
        'tfidf': {
            'precision': [],
            'recall': [],
            'f1': []
        }
    }

    for index, row in tqdm(df.iterrows(), total=df.shape[0]):
        x.append(index)

        values = row['textrank_metrics'].split(',')
        y['textrank']['precision'].append(values[0])
        y['textrank']['recall'].append(values[1])
        y['textrank']['f1'].append(values[2])

        values = row['tfidf_blob_metrics'].split(',')
        y['tfidf']['precision'].append(values[0])
        y['tfidf']['recall'].append(values[1])
        y['tfidf']['f1'].append(values[2])

        values = row['rake_metrics'].split(',')
        y['rake']['precision'].append(values[0])
        y['rake']['recall'].append(values[1])
        y['rake']['f1'].append(values[2])
    #plt.plot(x, y['textrank']['precision'], 'g^', x, y['textrank']['recall'],
    #'g-')

    #fig, ax = plt.subplots()

    #bar_values = [statistics.mean(list(map(float, y['textrank']['precision']))),
    #			statistics.mean(list(map(float, y['rake']['precision']))),
    #			statistics.mean(list(map(float, y['tfidf']['precision'])))]
    #bar_label = bar_values

    #bar_plot = plt.bar(['textrank', 'rake', 'tf-idf'], bar_values)

    #autolabel(ax, bar_plot, bar_label)
    #plt.ylim(0,max(bar_label) * 1.5)
    #plt.title('Quality metrics for ' + lang + ' language')
    #plt.savefig("add_text_bar_matplotlib_01.png", bbox_inches='tight')
    #plt.show()

    metrics = ['precision', 'recall', 'f1']
    for metric in metrics:
        fig, ax = plt.subplots()
        bar_values = [
            statistics.mean(list(map(float, y['textrank'][metric]))),
            statistics.mean(list(map(float, y['rake'][metric]))),
            statistics.mean(list(map(float, y['tfidf'][metric])))
        ]
        bar_label = bar_values
        bar_plot = plt.bar(['textrank', 'rake', 'tf-idf'], bar_values)
        autolabel(ax, bar_plot, bar_label)
        plt.ylim(0, max(max(bar_label) * 1.5, 0.01))
        title = ('Metric ' + metric + ' for ' + str(num) +
                 ' found keywords based on data in ' + lang +
                 ' (partMatchesCounted = ' + str(partMatchesCounted) + ')')
        plt.title(title)
        plt.savefig(title + ".png", bbox_inches='tight')

    plt.show()
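A minimal sketch of the metric bookkeeping above: each extractor writes its scores as one 'precision,recall,f1' string per article, and the bars plot the column-wise means (the numbers here are made up):

import statistics

rows = ['0.25,0.40,0.31', '0.30,0.35,0.32']  # e.g. df['rake_metrics'] values
parsed = [list(map(float, r.split(','))) for r in rows]
mean_precision, mean_recall, mean_f1 = (statistics.mean(col)
                                        for col in zip(*parsed))
print(mean_precision, mean_recall, mean_f1)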
Code Example #10
File: Main.py Project: korotaevaVika/ruKeywords
def lemmaRuWithQuality():
    lang = 'russian'

    if lang == 'russian':
        reader = Reader('elsevier journal.pdf')  #, language='russian')
        reader.loadFile()

        words, lines, articles = reader.parseDocPages()  # optionally (start, end), e.g. (0, 50)
        print(len(articles))

        df_source = reader.getArticlesDataframe()

    else:
        df_source = loadArticlesInEnglish()

    #partMatchesCounted = False #if true then custom, else rough precision recall
    #num = 8;
    #removeStopWords = False

    conditions = list(
        itertools.product(
            [False, True],  # removeStopWords
            [False, True],  # partMatchesCounted: custom vs. rough precision/recall
            [4, 8]))        # number of keywords

    df = None

    condition_i = 0

    for condition in conditions:
        removeStopWords, partMatchesCounted, num = condition

        print('condition: ', condition)

        if df is None:
            print('Read DF')
            df = df_source.copy()
            df['text'] = ''
            processor = Preprocessor(stopwordsList=None, lang=lang)
            sw = processor.stopwords
            #processor.stopwords = processor.get_normal_form_list(sw)
            processor.stopwords.extend(processor.get_normal_form_list(sw))

            if (lang == 'russian'):
                #df['text'] = df['Source Text']
                for index, row in tqdm(df.iterrows(), total=df.shape[0]):
                    df.at[index, 'text'] = processor.preprocess_text(
                        row['Source Text'],
                        removeStopWords=removeStopWords,
                        useLemmas=True,
                        applyLemmasToText=True)
            else:
                for index, row in tqdm(df.iterrows(), total=df.shape[0]):
                    text = processor.preprocess_text(
                        row['Source Text'],
                        removeStopWords=removeStopWords,
                        useLemmas=True)
                    df.at[index, 'text'] = text

        elif condition_i == 4:
            # condition 4 is the first with removeStopWords=True: strip the
            # stopwords from the already-lemmatized text once, in place
            for index, row in tqdm(df.iterrows(), total=df.shape[0]):
                text = processor.preprocess_text(row['text'],
                                                 removeStopWords=True,
                                                 useLemmas=True,
                                                 applyLemmasToText=False)
                df.at[index, 'text'] = text

        condition_i = condition_i + 1

        rakeExtractor.extractKeywords(df,
                                      num=num,
                                      metricsCount=True,
                                      partMatchesCounted=partMatchesCounted,
                                      textprocessor=processor)

        tfidfBlobExtractor = TfIdfBlobExtractor(processor.getStopwords(),
                                                textprocessor=processor)
        tfidfBlobExtractor.extractKeywords(
            df,
            num=num,
            metricsCount=True,
            partMatchesCounted=partMatchesCounted)

        textRankExtractor = TextRankExtractor(processor.getStopwords(),
                                              language=lang,
                                              textprocessor=processor)
        textRankExtractor.extractKeywords(
            df,
            num=num,
            metricsCount=True,
            partMatchesCounted=partMatchesCounted)

        x = []
        y = {
            'rake': {
                'precision': [],
                'recall': [],
                'f1': []
            },
            'textrank': {
                'precision': [],
                'recall': [],
                'f1': []
            },
            'tfidf': {
                'precision': [],
                'recall': [],
                'f1': []
            }
        }

        for index, row in tqdm(df.iterrows(), total=df.shape[0]):
            x.append(index)

            values = row['textrank_metrics'].split(',')
            y['textrank']['precision'].append(values[0])
            y['textrank']['recall'].append(values[1])
            y['textrank']['f1'].append(values[2])

            values = row['tfidf_blob_metrics'].split(',')
            y['tfidf']['precision'].append(values[0])
            y['tfidf']['recall'].append(values[1])
            y['tfidf']['f1'].append(values[2])

            values = row['rake_metrics'].split(',')
            y['rake']['precision'].append(values[0])
            y['rake']['recall'].append(values[1])
            y['rake']['f1'].append(values[2])

        metrics = ['precision', 'recall', 'f1']
        for metric in metrics:
            fig, ax = plt.subplots()
            bar_values = [
                statistics.mean(list(map(float, y['textrank'][metric]))),
                statistics.mean(list(map(float, y['rake'][metric]))),
                statistics.mean(list(map(float, y['tfidf'][metric])))
            ]

            bar_label = [round(bv, 2) for bv in bar_values]  # rounded labels

            bar_plot = plt.bar(['textrank', 'rake', 'tf-idf'], bar_values)
            autolabel(ax, bar_plot, bar_label)
            plt.ylim(0, max(max(bar_label) * 1.5, 0.01))
            title = ('Metric ' + metric + ' for ' + str(num) +
                     ' found keywords based on lemmatized data in ' + lang +
                     ' (partMatchesCounted = ' + str(partMatchesCounted) + ')' +
                     ' (removeStopWords = ' + str(removeStopWords) + ')')
            plt.title(title)
            plt.savefig(title + ".png", bbox_inches='tight')

    # show all figures once, after every condition has been processed
    plt.show()
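A minimal sketch of the condition grid driving the loop above: itertools.product enumerates all 2 x 2 x 2 combinations of the three experiment switches, in the same order the condition_i == 4 branch relies on:

import itertools

conditions = list(itertools.product(
    [False, True],   # removeStopWords
    [False, True],   # partMatchesCounted
    [4, 8]))         # number of keywords
for i, (removeStopWords, partMatchesCounted, num) in enumerate(conditions):
    print(i, removeStopWords, partMatchesCounted, num)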