# Ejemplo n.º 1  (scraped example-site marker — commented out, not Python code)
# 0
# Fix the MARC leader (LDR): when bytes 5-7 read 'naa', rewrite them to 'nab'.
# FIX: the original used x.replace('naa', 'nab'), which would also rewrite any
# *other* occurrence of 'naa' elsewhere in the leader; splice only the checked
# positions instead.
bn_articles['LDR'] = bn_articles['LDR'].apply(
    lambda x: x[:5] + 'nab' + x[8:] if x[5:8] == 'naa' else x)

# Google-sheet keys holding manually verified PBL<->BN person mappings.
pbl_viaf_links = [
    '1cEz73dGN2r2-TTc702yne9tKfH9PQ6UyAJ2zBSV6Jb0',
    '1_Bhwzo0xu4yTn8tF0ZNAZq9iIAqIxfcrjeLVCm_mggM',
    '1L-7Zv9EyLr5FeCIY_s90rT5Hz6DjAScCx6NxfuHvoEQ'
]
# Collect per-sheet frames and concatenate once at the end: DataFrame.append
# is deprecated (removed in pandas 2.0) and quadratic when called in a loop.
pbl_viaf_frames = []
for elem in pbl_viaf_links:
    df = gsheet_to_df(elem, 'pbl_bn').drop_duplicates()
    # Keep only rows confirmed as the same person ('czy_ten_sam' != 'nie').
    df = df[df['czy_ten_sam'] != 'nie'][['pbl_id', 'BN_id', 'BN_name']]
    # Clean name separators; regex=True made explicit (the implicit regex
    # default warns and changed in pandas 2.0).
    df['BN_name'] = df['BN_name'].str.replace(
        r'\|\(', ' (', regex=True).str.replace(
        r'\;\|', '; ', regex=True).str.replace(r'\|$', '', regex=True)
    df['index'] = df.index + 1
    # One row per '|'-separated name variant.
    df = cSplit(df, 'index', 'BN_name', '\|').drop(columns='index')
    pbl_viaf_frames.append(df)
pbl_viaf = pd.concat(pbl_viaf_frames).drop_duplicates()

if mode == "with people":

    # NOTE: doesn't work for articles (still good for books)

    # Creators joined with their PBL sections (full outer join keeps
    # sections without creators and vice versa).
    tworca_i_dzial_sql = """select tw.tw_tworca_id "pbl_id", dz.dz_dzial_id||'|'||dz.dz_nazwa "osoba_pbl_dzial_id_name"
                        from pbl_tworcy tw
                        full join pbl_dzialy dz on dz.dz_dzial_id=tw.tw_dz_dzial_id"""
    tworca_i_dzial = pd.read_sql(tworca_i_dzial_sql, con=connection)
    # Normalise DB NULLs to NaN.
    tworca_i_dzial = tworca_i_dzial.fillna(value=np.nan)
    # Render the numeric id as a width-4, zero-decimal string.
    tworca_i_dzial['pbl_id'] = tworca_i_dzial['pbl_id'].apply(
        lambda ident: '{:4.0f}'.format(ident))
# NOTE(review): `wsh` is opened elsewhere (not visible in this chunk).
wsh.set_basic_filter()

# Articles carrying at least one keyword, trimmed to the display columns.
has_keywords = df['subject❦pl'].notnull()
df_articles_with_key_words = df.copy()[has_keywords][
    ['creator', 'title❦pl', 'subject❦pl', 'tytuł czasopisma']
].reset_index(drop=True)
# Export to the keywords spreadsheet and make the sheet filterable.
s_key_words.df_to_sheet(
    df_articles_with_key_words,
    sheet='słowa kluczowe dla artykułów',
    index=0)
worksheet = key_word_sheet.worksheet('słowa kluczowe dla artykułów')
worksheet.freeze(rows=1)
worksheet.set_basic_filter()

print(f'artykuły ze słowami kluczowymi = {len(df_articles_with_key_words)}')
# artykuły ze słowami kluczowymi = 8270
# Explode multi-valued keywords ('❦'-separated) into one row per keyword.
df_articles_with_key_words['indeks'] = df_articles_with_key_words.index + 1
df_articles_with_key_words = cSplit(
    df_articles_with_key_words, 'indeks', 'subject❦pl', '❦')

# Keyword frequency table.  FIX: rename_axis + reset_index(name=...) yields
# the ['słowo kluczowe', 'frekwencja'] columns on every pandas version —
# the original's value_counts().reset_index().rename({'index': ...}) relied
# on pre-2.0 column naming (2.0 names the columns differently).
df_key_words = (
    df_articles_with_key_words['subject❦pl'].str.lower()
    .value_counts()
    .rename_axis('słowo kluczowe')
    .reset_index(name='frekwencja'))
s_key_words.df_to_sheet(df_key_words,
                        sheet='słowa kluczowe - statystyki',
                        index=0)
worksheet = key_word_sheet.worksheet('słowa kluczowe - statystyki')
worksheet.freeze(rows=1)
worksheet.set_basic_filter()
                              'X245', 'X650', 'X655'
                          ]]
    # NOTE(review): this fragment starts mid-statement — the opening of the
    # `to_remove = ...[[` column selection (and its enclosing block) is not
    # visible here; the list above is its tail.
    # Parse the X100 (personal-name) field, then drop from the removal set
    # the records whose X100 $d year is <= 1700.
    X100_field = marc_parser_1_field(to_remove, 'id', 'X100', '\$')
    # First run of digits in subfield $d — presumably a birth year; TODO confirm.
    X100_field['year'] = X100_field['$d'].apply(
        lambda x: re.findall('\d+', x)[0] if x != '' else np.nan)
    X100_field = X100_field[X100_field['year'].notnull()]
    X100_field = X100_field[X100_field['year'].astype(int) <= 1700]
    to_remove = to_remove[~to_remove['id'].isin(X100_field['id'])]

    bn_books = bn_books[~bn_books['id'].isin(to_remove['id'])]
    # Slice of the remaining BN book records used to enrich PBL.
    pbl_enrichment = bn_books[[
        'id', 'dziedzina_PBL', 'rodzaj_ksiazki', 'DZ_NAZWA', 'X650', 'X655'
    ]]
    # Trim trailing ' - ...' qualifiers from the section name.
    pbl_enrichment['DZ_NAZWA'] = pbl_enrichment['DZ_NAZWA'].str.replace(
        ' - .*?$', '', regex=True)
    # One row per '❦'-separated X655 (genre/form) value.
    pbl_enrichment = cSplit(pbl_enrichment, 'id', 'X655', '❦')
    # X655 entries containing a $x subdivision are moved into X650;
    # plain ones stay in X655.
    pbl_enrichment['jest x'] = pbl_enrichment['X655'].str.contains('\$x')
    pbl_enrichment['nowe650'] = pbl_enrichment.apply(
        lambda x: x['X655'] if x['jest x'] == True else np.nan, axis=1)
    pbl_enrichment['X655'] = pbl_enrichment.apply(
        lambda x: x['X655'] if x['jest x'] == False else np.nan, axis=1)
    pbl_enrichment['X650'] = pbl_enrichment[['X650', 'nowe650']].apply(
        lambda x: '❦'.join(x.dropna().astype(str)), axis=1)
    pbl_enrichment = pbl_enrichment.drop(['jest x', 'nowe650'], axis=1)

    # Match X655/X650 values against the PBL genre dictionary
    # (case-insensitive substring match via SQL LIKE).
    query = "select * from pbl_enrichment a join gatunki_pbl b on lower(a.X655) like '%'||b.gatunek||'%'"
    gatunki1 = pandasql.sqldf(query)
    query = "select * from pbl_enrichment a join gatunki_pbl b on lower(a.X650) like '%'||b.gatunek||'%'"
    gatunki2 = pandasql.sqldf(query)
    gatunki = pd.concat([gatunki1, gatunki2]).drop_duplicates()
    # NOTE(review): SOURCE is truncated here — the .apply( call below is cut
    # off mid-statement.
    gatunki['gatunek'] = gatunki['gatunek'].apply(
# Ejemplo n.º 4  (scraped example-site marker — commented out, not Python code)
# 0
# ]  (stray bracket left over from a truncated list literal)

# Merge the manually verified person mappings from every sheet listed in
# `mapowanie_osob`.  Frames are collected and concatenated once at the end:
# DataFrame.append is deprecated (removed in pandas 2.0) and quadratic when
# called in a loop.
osoby_frames = []
for file in tqdm(mapowanie_osob):
    sheet = gc.open_by_key(file)
    df_osoby = get_as_dataframe(
        sheet.worksheet('pbl_bn'),
        evaluate_formulas=True).dropna(how='all').dropna(
            how='all', axis=1).drop_duplicates()
    # Keep only rows confirmed as the same person ('czy_ten_sam' != 'nie').
    df_osoby = df_osoby[df_osoby['czy_ten_sam'] != 'nie'][[
        'pbl_id', 'BN_id', 'BN_name'
    ]]
    # Clean name separators; regex=True made explicit (the implicit regex
    # default warns and changed in pandas 2.0).
    df_osoby['BN_name'] = df_osoby['BN_name'].str.replace(
        r'\|\(', ' (', regex=True).str.replace(
        r'\;\|', '; ', regex=True).str.replace(r'\|$', '', regex=True)
    df_osoby['index'] = df_osoby.index + 1
    # One row per '|'-separated name variant.
    df_osoby = cSplit(df_osoby, 'index', 'BN_name', '\|').drop(columns='index')
    osoby_frames.append(df_osoby)

# Guard against an empty mapping list (pd.concat([]) raises).
mapowanie_osob_df = (pd.concat(osoby_frames) if osoby_frames
                     else pd.DataFrame())
mapowanie_osob_df = mapowanie_osob_df.drop_duplicates().reset_index(drop=True)


def rok_zgonu(x):
    """Extract the year of death from a BN name heading.

    Args:
        x: heading string such as 'Kowalski, Jan (1900-1950)'; may also be
           None/NaN (non-string input yields None).

    Returns:
        The first run of digits following '- ca ', '-ca ', '-ok. ', '-' or
        'po ' as an int, or None when no such year is present.
    """
    # FIX: the original pattern used a variable-width lookbehind
    # '(?<=\- ca |\-ca |\-ok\. |\-|po )', which the stdlib `re` module
    # rejects ("look-behind requires fixed-width pattern") — the function
    # raised re.error on every string input.  Matching the prefix with a
    # non-capturing group and returning the digits group extracts exactly
    # the same year.
    try:
        return int(
            re.search(r'(?:- ca |-ca |-ok\. |-|po )(\d+)', x).group(1))
    except (TypeError, AttributeError):
        return None


# Year of death per mapped person; pass the function directly — wrapping it
# in `lambda x: rok_zgonu(x)` added nothing.
mapowanie_osob_df['rok zgonu'] = mapowanie_osob_df['BN_name'].apply(rok_zgonu)
# Ejemplo n.º 5  (scraped example-site marker — commented out, not Python code)
# 0
# Normalise control field 008: drop '%'-prefixed subfield markers together
# with the single character right before them, then map '+'/'!' to '-'.
bar_catalog['008'] = bar_catalog['008'].str.strip().str.replace(
    '(^|.)(\%.)', '', regex=True).str.replace('([\+\!])', '-', regex=True)
# Field 100 may hold two '❦'-separated values: keep the first part in 100
# and stash the remainder in a temporary 'do 100' column.
bar_catalog['do 100'] = bar_catalog['100'].str.replace(r'(^.+?)(❦)(.+?$)',
                                                       r'\3',
                                                       regex=True)
bar_catalog['100'] = bar_catalog['100'].str.replace(r'(^.+?)(❦)(.+?$)',
                                                    r'\1',
                                                    regex=True)
# Append the overflow values to field 700, then drop the temp column.
bar_catalog['700'] = bar_catalog[['do 100', '700']].apply(
    lambda x: ''.join(x.dropna().astype(str)), axis=1)
del bar_catalog['do 100']
# After subfield %d: unwrap '(...)' (with optional trailing dot); also strip
# a trailing period that follows two lowercase letters.
bar_catalog['100'] = bar_catalog['100'].str.replace(
    r'(?<=\%d)(\()(.+?)(\)\.{0,1})', r'\2',
    regex=True).str.replace('(?<=[a-zà-ž][a-zà-ž])\.$', '', regex=True)

# One row per '❦'-separated 600 value, then the same %d cleanup as for 100.
bar_catalog = cSplit(bar_catalog, '001', '600', '❦')
bar_catalog['600'] = bar_catalog['600'].str.replace(
    r'(?<=\%d)(\()(.+?)(\)\.{0,1})', r'\2',
    regex=True).str.replace('(?<=[a-zà-ž][a-zà-ž])\.$', '', regex=True)
# Derive 787 from 600: drop the %d subfield and overwrite the first two
# characters with '08' (presumably indicator bytes — TODO confirm).
bar_catalog['787'] = bar_catalog['600'].str.replace('(\%d.+?)(?=\%)',
                                                    '',
                                                    regex=True).str.replace(
                                                        r'(?<=^)(..)',
                                                        r'08',
                                                        regex=True)
# Keep 787 only when it actually carries a %t (title) subfield.
bar_catalog['787'] = bar_catalog['787'].apply(
    lambda x: x if pd.notnull(x) and '%t' in x else np.nan)
# Remove %t subfields from 600.
bar_catalog['600'] = bar_catalog['600'].str.replace('(\%t.+?)(?=\%|$)',
                                                    '',
                                                    regex=True).str.strip()
bar_catalog['600'] = bar_catalog.groupby('001')['600'].transform(
# NOTE(review): SOURCE is truncated/garbled here — the .transform( call
# above is cut off mid-statement, and the indented lines below are the
# interior of a *different* web-scraping loop whose for/try header is not
# visible in this chunk.
            # Person name from the page's <h2>; fallback marker on failure.
            data_person = tree.select_one('h2').text
        except:
            data_person = "brak danych (CR)"
        try:
            # Role from <h3>.
            role = tree.select_one('h3').text
        except:
            role = "brak danych (CR)"
        try:
            # Prize description from the '.indent' element.
            description = tree.select_one('.indent').text
        except:
            description = "brak danych (CR)"
        data.append([data_person, role, description])
end_time = time.time()
# Total scraping wall time (seconds).
print(end_time - start_time)

df = pd.DataFrame(data, columns=['data_person', 'role', 'description'])

# Birth/death years trail the name after ', ' — keep them only when the
# string actually contains a 4-digit year.
df['person_year'] = df['data_person'].apply(lambda x: re.sub(
    r'(.+)(, )(\d.+)', r'\3', x) if re.search(r'\d{4}', x) else np.nan)
# Each prize entry starts on a new line with 'YYYY <uppercase letter>';
# mark those boundaries with '❦' (the third-party `regex` module is used
# for the \p{Lu} Unicode category).
df['single_prize'] = df['description'].apply(
    lambda x: regex.sub('(\n)(\d{4} \p{Lu})', r'❦\2', x))
df['index'] = df.index + 1
# One row per prize.
df = cSplit(df, 'index', 'single_prize', '❦')
df['single_prize'] = df['single_prize'].str.replace('\n', '❦', regex=False)
df = df[df['single_prize'].notnull()]
# FIX: regex=True made explicit — since pandas 2.0 str.replace treats the
# pattern literally by default, which would leave 'prize_year' as the full
# entry instead of just the leading year.
df['prize_year'] = df['single_prize'].str.replace(r'(^\d{4})(.+)', r'\1',
                                                  regex=True)
df['book_title_reason'] = df['single_prize'].apply(lambda x: re.sub(
    r'(^\d{4} )(.+)(❦)(.+)', r'\4', x) if '❦' in x else np.nan)

# Spot-check one description cell.
df.iloc[9, 2]
# Ejemplo n.º 7  (scraped example-site marker — commented out, not Python code)
# 0
# Counts of creators / institutions / publishers in the SYNAT dump.
# FIX: dropped the whole-frame .copy() calls — copying the entire DataFrame
# just to take len() of a boolean filter is pure overhead and does not
# change the result.
liczba_tworcow = len(
    full_synat[full_synat['uwagi'].str.lower().str.contains('biograf')])

liczba_instytucji = len(full_synat[(full_synat['710'] != '') & (
    ~full_synat['516'].str.lower().str.contains('wydawnictw'))])

liczba_wydawnictw = len(
    full_synat[full_synat['516'].str.lower().str.contains('wydawnictw')])

# --- tidying up web services/portals and magazines ---

serwisy_portale = gsheet_to_df('1EWzb9mCsTVxYDqj_CzKW5EcqJy4z3a_-GyNGAbAjGH0',
                               'Serwisy, portale (finalny)')
# Column selection already returns a new frame, so the original's extra
# .copy() was redundant.
s_p_adres = serwisy_portale[['id', 'adres']]
# One row per '❦'-separated address.
s_p_adres = cSplit(s_p_adres, 'id', 'adres', '❦')

s_p_adres = s_p_adres[s_p_adres['adres'].str.contains('http')]
# Strip leading junk ('...u ', 'u ', '3 ') that precedes the http part.
s_p_adres['adres'] = s_p_adres['adres'].apply(
    lambda x: re.sub(r'(.+?u |^u |^3 )(h.+$)', r'\2', x))
# Keep only the first whitespace-separated token.
s_p_adres['adres'] = s_p_adres['adres'].apply(
    lambda x: re.sub(r'(.+?)( .+)', r'\1', x))
# Vectorised length instead of apply(lambda x: len(x)).
s_p_adres['len'] = s_p_adres['adres'].str.len()
# Normalise 'http://www.' to 'http://' so variants dedupe together
# (regex=False: this is a literal replacement).
s_p_adres['adres'] = s_p_adres['adres'].str.replace('http://www.', 'http://',
                                                    regex=False)
s_p_adres['id'] = s_p_adres['id'].astype(int)
# Shortest address first within each id.
s_p_adres = s_p_adres.sort_values(['id', 'len']).drop_duplicates()

# One (shortest) address per id.
test = s_p_adres.groupby('id').head(1).reset_index(drop=True).drop(
    columns='len')
# Distinct PBL magazine ids present in the BN statistics.
pbl_ids = bn_stats.copy()['pbl_id'].drop_duplicates().astype(int).tolist()

pbl_query = """select z.za_zapis_id "record_id", zr.zr_zrodlo_id "pbl_id", z.za_ro_rok "year"
            from pbl_zapisy z
            join IBL_OWNER.pbl_zrodla zr on zr.zr_zrodlo_id=z.za_zr_zrodlo_id"""

# Record counts per (pbl_id, year), restricted to magazines seen in BN.
pbl_stats = pd.read_sql(pbl_query, connection)
pbl_stats = pbl_stats[pbl_stats['pbl_id'].isin(pbl_ids)]
pbl_stats = (pbl_stats.groupby(['pbl_id', 'year'])
             .count()
             .reset_index(level=['pbl_id', 'year'])
             .rename(columns={'record_id': 'liczba PBL'}))

# Build a shared 'pbl_id|year' join key on both sides.
bn_stats['pbl_id'] = bn_stats['pbl_id'].astype(np.int64)
bn_stats['help'] = bn_stats['pbl_id'].astype(str) + '|' + bn_stats['year']
pbl_stats['help'] = (pbl_stats['pbl_id'].astype(str) + '|'
                     + pbl_stats['year'].astype(str))

# Left-join PBL record counts onto the BN statistics via the composite key.
stats = pd.merge(bn_stats, pbl_stats, how='left', on='help')
keep_cols = [
    'help', 'pbl_magazine', 'liczba BN', 'liczba BN ok',
    'procent literacki BN', 'liczba PBL'
]
stats = stats[keep_cols]
# Split the composite key back into separate 'pbl_id' / 'year' columns.
stats['index'] = stats.index + 1
stats = cSplit(stats, 'index', 'help', '|', 'wide')
stats = (stats
         .rename(columns={'help_0': 'pbl_id', 'help_1': 'year'})
         .drop(columns=['index'])
         .sort_values(['pbl_magazine', 'year']))

stats.to_excel('statystyki_czasopism_bn_pbl.xlsx', index=False)