Ejemplo n.º 1
0
def make_collocation_graph(target,
                           top=15,
                           urns=None,
                           cutoff=0,
                           cut_val=2,
                           before=4,
                           after=4,
                           limit=1000):
    """Make a cascaded (two-level) network from collocations.

    The collocates of ``target`` are ranked first; then each of the ``top``
    alphabetic collocates is looked up in turn and its own collocates are
    ranked the same way. Every ranked list contributes edges
    ``(source word, collocate)`` to an undirected graph.

    Ranking score: ``count ** 1.2 / corpus_total`` where ``corpus_total`` is
    the word's summed frequency over ``urns``.

    Args:
        target: A word, a whitespace separated string of words, or a list of
            words. A string is split on whitespace.
        top: How many of the highest ranked collocates to keep per node.
        urns: Corpus definition; ``urn[0]`` of every entry is handed to
            ``get_freq``. Defaults to an empty corpus.
        cutoff: Accepted for backward compatibility; not used by this
            function.
        cut_val: Corpus totals that are not greater than this value are
            masked out before scoring.
        before: Window size before the word, passed to the collocation
            helpers.
        after: Window size after the word, passed to the collocation helpers.
        limit: Passed to ``urn_coll_words`` for the first-level lookup only.

    Returns:
        An ``nx.Graph`` holding the collected edges.
    """
    # A literal ``[]`` default is created once and shared by every call (and
    # by every helper it is passed to); build a fresh list per call instead.
    if urns is None:
        urns = []

    # Corpus-wide word totals, used as the denominator of every score.
    antall = Counter()
    for urn in urns:
        antall += get_freq(urn[0], top=0, cutoff=0)

    korpus_totalen = frame(antall, 'total')
    total = korpus_totalen[korpus_totalen > cut_val]

    def ranked(counts, label):
        # Score the counts against the corpus totals and sort best-first.
        scored = frame(counts**1.2 / total['total'], label)
        return scored.sort_values(by=label, ascending=False)

    if isinstance(target, str):
        target = target.split()

    first_level = urn_coll_words(target,
                                 urns=urns,
                                 before=before,
                                 after=after,
                                 limit=limit)
    toppis = ranked(first_level[0], target[0])

    # The root node is the single target word, or the first two target words
    # joined with an underscore.
    root = target[0] if len(target) == 1 else '_'.join(target[:2])
    tops = {root: toppis}

    # Second level: look up collocates for each alphabetic top collocate.
    for word in toppis[:top].index:
        if word.isalpha():
            word_coll = frame(
                urn_coll(word, urns=urns, before=before, after=after), word)
            tops[word] = ranked(word_coll[word], word)

    edges = [(source, coll)
             for source, ranking in tops.items()
             for coll in ranking[:top].index
             if coll.isalpha()]

    graph = nx.Graph()
    graph.add_edges_from(edges)
    return graph
Ejemplo n.º 2
0
def ngavis(word, period):
    """Fetch an n-gram frame for ``word`` with ``media='avis'``.

    A ``word`` containing a space is treated as a bigram: its first two
    whitespace separated tokens are sent to ``nb.bigram``. Anything else is
    sent to ``nb.unigram``. The resulting frame is labelled with ``word``.

    Args:
        word: The word or two-word phrase to look up.
        period: Period specification forwarded unchanged to the ``nb`` call.

    Returns:
        The frame built by ``nb.frame``, or an empty ``pd.DataFrame`` when
        the lookup fails for any reason (best-effort behaviour).
    """
    try:
        if " " in word:
            bigram = word.split()[:2]
            res = nb.frame(
                nb.bigram(first=bigram[0],
                          second=bigram[1],
                          period=period,
                          media='avis'), word)
        else:
            res = nb.frame(nb.unigram(word, period=period, media='avis'), word)
    except Exception:
        # Deliberately best-effort, but a bare ``except:`` would also swallow
        # KeyboardInterrupt and SystemExit; let those propagate.
        res = pd.DataFrame()
    return res
Ejemplo n.º 3
0
def unigram(word,
            period=(1950, 2020),
            media='bok',
            ddk=None,
            topic=None,
            gender=None,
            publisher=None,
            lang=None,
            trans=None,
            name=None):
    """Query the unigram endpoint at api.nb.no and return the reply as a frame.

    All arguments are sent as query parameters; ``period[0]`` and
    ``period[1]`` become ``period0`` and ``period1``. The JSON reply is
    converted to a dict and passed to ``nb.frame``.
    """
    query = {
        'word': word,
        'ddk': ddk,
        'topic': topic,
        'gender': gender,
        'publisher': publisher,
        'lang': lang,
        'trans': trans,
        'period0': period[0],
        'period1': period[1],
        'media': media,
        'name': name
    }
    response = requests.get("https://api.nb.no/ngram/unigrams", params=query)
    payload = dict(response.json())
    return nb.frame(payload)
Ejemplo n.º 4
0
def coll_avis(word,
              title='%',
              before=5,
              after=5,
              datefrom="1800-01-01",
              dateto="2000-01-01",
              limit=1000):
    """Return the collocations from ``coll_newspaper`` as a transposed frame.

    The raw result is framed, the row labelled ``0`` is selected, transposed
    and framed once more. All arguments are forwarded unchanged to
    ``coll_newspaper``.
    """
    raw = coll_newspaper(word,
                         title=title,
                         before=before,
                         after=after,
                         datefrom=datefrom,
                         dateto=dateto,
                         limit=limit)
    first_row = nb.frame(raw).loc[0]
    return nb.frame(first_row.transpose())
Ejemplo n.º 5
0
def phrase_plots_level(phrase_set,
                       title='*',
                       period=(20100101, 20301231),
                       media='aviser'):
    """Build one frame per phrase group and combine them column-wise.

    Each element of ``phrase_set`` is passed to ``get_df_level`` together
    with ``title``, ``period`` and ``media``; the resulting frame is labelled
    with the group's phrases joined by ``', '``.

    Returns:
        The per-group frames concatenated along ``axis=1``.
    """
    columns = []
    for phrases in phrase_set:
        counts = get_df_level(phrases, title=title, period=period, media=media)
        columns.append(nb.frame(counts, ', '.join(phrases)))
    return pd.concat(columns, sort=False, axis=1)
Ejemplo n.º 6
0
def phrase_plots_anno(phrase_sets,
                      title='aftenposten',
                      fra=1960,
                      til=2020,
                      rot=0,
                      colours=['r', 'b', 'g']):
    """Draw a bar chart with one frame per phrase group.

    Each element of ``phrase_sets`` is looked up with ``get_df`` for
    ``title`` and labelled with its phrases joined by ``', '``. The frames
    are concatenated, the index is cast to ``int`` and sorted, and the
    result is plotted as bars using ``rot`` and ``colours``.

    ``fra`` and ``til`` are accepted but not used by the current body.

    Returns:
        None; the function is called for its plotting side effect.
    """
    frames = [
        nb.frame(get_df(phrases, title=title), ', '.join(phrases))
        for phrases in phrase_sets
    ]
    combined = pd.concat(frames, sort=False)
    combined.index = combined.index.astype(int)
    ordered = combined.sort_index()
    ordered.plot(kind='bar', figsize=(15, 5), rot=rot, color=colours)
Ejemplo n.º 7
0
def make_dtm(texts):
    """Build a document-term matrix with one column per text.

    Args:
        texts: Mapping from a text identifier to an iterable of parts. Every
            part is fed to ``Counter.update`` (presumably a list of tokens -
            confirm against the caller).

    Returns:
        The per-text count frames concatenated along ``axis=1``. An empty
        ``texts`` yields an empty ``pd.DataFrame`` instead of the
        ``ValueError`` that ``pd.concat`` raises for an empty list.
    """
    columns = []
    for text, parts in texts.items():
        # Progress output, kept from the original implementation.
        print(text)
        counts = Counter()
        for part in parts:
            counts.update(part)
        columns.append(nb.frame(counts, text))

    if not columns:
        # pd.concat refuses an empty sequence; nothing to count means an
        # empty matrix.
        return pd.DataFrame()
    return pd.concat(columns, axis=1, sort=False)
Ejemplo n.º 8
0
def ngbok(word, period, ddk=None, lang='nob'):
    """Fetch an n-gram frame for ``word`` with ``media='bok'``.

    A ``word`` containing a space is treated as a bigram: its first two
    whitespace separated tokens are sent to ``nb.bigram``. Anything else is
    sent to ``nb.unigram``. The resulting frame is labelled with ``word``.

    Args:
        word: The word or two-word phrase to look up.
        period: Period specification forwarded unchanged to the ``nb`` call.
        ddk: Optional filter forwarded unchanged to the ``nb`` call.
        lang: Language code forwarded unchanged to the ``nb`` call.

    Returns:
        The frame built by ``nb.frame``, or an empty ``pd.DataFrame`` when
        the lookup fails for any reason (best-effort behaviour).
    """
    try:
        if " " in word:
            bigram = word.split()[:2]
            res = nb.frame(
                nb.bigram(first=bigram[0],
                          second=bigram[1],
                          ddk=ddk,
                          period=period,
                          media='bok',
                          lang=lang), word)
        else:
            res = nb.frame(
                nb.unigram(word,
                           period=period,
                           ddk=ddk,
                           media='bok',
                           lang=lang), word)
    except Exception:
        # Deliberately best-effort, but a bare ``except:`` would also swallow
        # KeyboardInterrupt and SystemExit; let those propagate.
        res = pd.DataFrame()
    return res
Ejemplo n.º 9
0
def create_frame(coll, expected):
    """Frame a collocation result and append a ``score`` column.

    ``coll`` is framed and transposed, then re-framed with the columns
    ``freq``, ``doc`` and ``dist``. The ``score`` column is whatever ``dist``
    returns for the ``dist`` column, ``expected`` and the ``freq`` column.
    """
    column_names = 'freq doc dist'.split()
    table = nb.frame(frame(coll).transpose(), column_names)
    table['score'] = dist(table['dist'], expected, table['freq'])
    return table
Ejemplo n.º 10
0
            # try again - things may have loaded on the server...
            print('prøver en gang til for: ', (year, year + step))
            try:
                colls[(year, year + step)] = collocation(word,
                                                         yearfrom=year,
                                                         yearto=year + step,
                                                         corpus='avis',
                                                         before=before,
                                                         after=after)
            except:
                print('klarte ikke: ', (year, year + step))
    colls_df = colls2df(colls, calculate_midpoint(before, after))
    return colls_df, score_df(colls_df)


def score_df(df):
    """Collect the ``'score'`` entry of every item in ``df`` into one frame.

    Iterates over ``df``, picks ``df[x]['score']`` for each key ``x``, frames
    the resulting dict with ``nb.frame`` and returns it transposed.
    """
    scores = {x: df[x]['score'] for x in df}
    return nb.frame(scores).transpose()


def display_vals(kr_df, word, clip=0):
    """Return the row ``word`` of ``kr_df`` with values below ``clip`` masked.

    Cells that are not ``>= clip`` are masked by the boolean selection
    before the row is looked up with ``.loc``.
    """
    masked = kr_df[kr_df >= clip]
    return masked.loc[word]


def show_frame(df,
               colnum=0,
               clip=0,
               fillval=10,
               cmap='Blues',
               up=True,
               axis=0,
               first_row=0,
               number_of_rows=20):
    if up == True:
        cmap = cmap + '_r'
        dfc = df[df >= clip]
Ejemplo n.º 11
0
def ngbok(x, period, ddk=None):
    """Fetch the unigram frame for ``x`` with ``media='bok'``.

    Args:
        x: The word to look up; also used as the label passed to
            ``nb.frame``.
        period: Period specification forwarded unchanged to ``nb.unigram``.
        ddk: Optional filter forwarded unchanged to ``nb.unigram``.

    Returns:
        The frame built by ``nb.frame``, or an empty ``pd.DataFrame`` when
        the lookup fails for any reason (best-effort behaviour).
    """
    try:
        r = nb.frame(nb.unigram(x, period, media='bok', ddk=ddk), x)
    except Exception:
        # Deliberately best-effort, but a bare ``except:`` would also swallow
        # KeyboardInterrupt and SystemExit; let those propagate.
        r = pd.DataFrame()
    return r
Ejemplo n.º 12
0
def frm(x, y):
    """Frame ``x`` with label ``y`` unless ``x`` is empty.

    An empty ``x`` is returned as-is (the very same object); otherwise the
    result of ``nb.frame(x, y)`` is returned.
    """
    return x if x.empty else nb.frame(x, y)
Ejemplo n.º 13
0
# Streamlit script fragment: for every word in the word list, fetch a set of
# books and show how often each of the words occurs in those books.
# `antall`, `allword`, `ddk` and `period_slider` are defined elsewhere in the
# script.
#st.line_chart(tot)

#if st.button('Sjekk fordeling i bøker'):
# Only query when a positive number of books has been requested.
if antall > 0:

    wordlist = allword

    # One book lookup per word: word -> result of nb.book_urn for that word,
    # restricted by `ddk` and the selected period, at most `antall` hits.
    urns = {
        w: nb.book_urn(words=[w],
                       ddk=ddk,
                       period=(period_slider[0], period_slider[1]),
                       limit=antall)
        for w in wordlist
    }
    #data = {w: nb.aggregate_urns(urns[w]) for w in wordlist}
    #st.write([(w,urns[w]) for w in wordlist])
    # First element of every entry returned for `w` (presumably the URN
    # identifier - confirm against nb.book_urn).
    urner = lambda w: [x[0] for x in urns[w]]
    #st.write(urner(wordlist[0]))
    # For each word's books, the frequencies of all words in the word list;
    # keys are prefixed with 'bok '.
    data = {'bok ' + w: nb.word_freq(urner(w), wordlist) for w in wordlist}

    # Heading text (Norwegian): "Books containing one of <words> in the
    # columns, word frequency in the rows".
    st.markdown(
        "### Bøker som inneholder en av _{ws}_ i kolonnene, ordfrekvens i radene"
        .format(ws=', '.join(wordlist)))

    # Text (Norwegian): "A diagonal indicates that the words mutually exclude
    # each other".
    st.write('En diagonal indikerer at ordene gjensidig utelukker hverandre')

    # Show the transposed frame with missing values replaced by 0.
    st.write(nb.frame(data).transpose().fillna(0))

    #st.write(df.loc[wordlist].fillna(0))
Ejemplo n.º 14
0
# Streamlit script fragment: normalise the `ddk` filter, read the number of
# books and the period from the UI, then show word frequencies per word.
# `ddk`, `words` and `wordlist` are defined elsewhere in the script.

# An empty filter string means "no filter".
if ddk == "":
    ddk = None

# Make sure a given filter ends with "%" (presumably a trailing wildcard for
# prefix matching - confirm against nb.book_urn).
if ddk is not None and not ddk.endswith("%"):
    ddk = ddk + "%"

# Label (Norwegian): number of books - more books means longer waiting time.
antall = st.number_input(
    'Antall bøker - jo fler jo lenger ventetid, forskjellige søk vil vanligvis gi nye bøker (trykk på +/- for starte nye søk',
    10)

# NOTE(review): the label says "between 1900 and 2014" while the slider range
# is 1900-2020 - confirm which one is intended.
period_slider = st.slider('Angi periode - år mellom 1900 og 2014', 1900, 2020,
                          (1950, 2010))

if words != "":
    # One book lookup per word: word -> result of nb.book_urn for that word,
    # restricted by `ddk` and the selected period, at most `antall` hits.
    urns = {
        w: nb.book_urn(words=[w],
                       ddk=ddk,
                       period=(period_slider[0], period_slider[1]),
                       limit=antall)
        for w in wordlist
    }
    data = {w: nb.aggregate_urns(urns[w]) for w in wordlist}

    # One column per word, labelled 'bøker <word>'.
    df = pd.concat([nb.frame(data[w], 'bøker ' + w) for w in wordlist], axis=1)

    # Heading text (Norwegian): "Books containing one of <words> in the
    # columns, word frequency in the rows".
    st.markdown(
        "### Bøker som inneholder en av _{ws}_ i kolonnene, ordfrekvens i radene"
        .format(ws=', '.join(wordlist)))
    # Text (Norwegian): "A diagonal indicates that the words mutually exclude
    # each other".
    st.write('En diagonal indikerer at ordene gjensidig utelukker hverandre')
    # Only the rows for the searched words, with missing values shown as 0.
    st.write(df.loc[wordlist].fillna(0))
Ejemplo n.º 15
0
def collocations_from_nb(word, corpus, func = get_konkordanser):
    """Get a concordance and count the words in it.

    The concordance returned by ``func(word, corpus)`` is framed; it is
    assumed to be a dataframe with the columns 'after' and 'before'.

    Args:
        word: The word to fetch a concordance for; also used as the label of
            the resulting count column.
        corpus: Corpus argument forwarded unchanged to ``func``.
        func: Callable producing the concordance.

    Returns:
        The word counts as a frame, sorted by ``nb.frame_sort``.
    """
    concordance = nb.frame(func(word, corpus))
    # Adding the two value arrays directly glues each row's 'after' text to
    # its 'before' text with nothing in between, which fuses the last word of
    # one with the first word of the other. Put a space between them.
    contexts = [
        after + ' ' + before
        for after, before in zip(concordance['after'].values,
                                 concordance['before'].values)
    ]
    counts = Counter(tokenize(' '.join(contexts)))
    return nb.frame_sort(nb.frame(counts, word))
Ejemplo n.º 16
0
def count_from_conc(concordance):
    """From a concordance, count the words in it.

    Args:
        concordance: Assumed to be a dataframe with the columns 'word',
            'after' and 'before'. The entry ``concordance['word'][0]`` labels
            the resulting count column.

    Returns:
        The word counts as a frame, sorted by ``nb.frame_sort``.
    """
    word = concordance['word'][0]
    # Adding the two value arrays directly glues each row's 'after' text to
    # its 'before' text with nothing in between, which fuses the last word of
    # one with the first word of the other. Put a space between them.
    contexts = [
        after + ' ' + before
        for after, before in zip(concordance['after'].values,
                                 concordance['before'].values)
    ]
    counts = Counter(tokenize(' '.join(contexts)))
    return nb.frame_sort(nb.frame(counts, word))
Ejemplo n.º 17
0
def ngavis(x, period):
    """Fetch the unigram frame for ``x`` with ``media='avis'``.

    Args:
        x: The word to look up; also used as the label passed to
            ``nb.frame``.
        period: Period specification forwarded unchanged to ``nb.unigram``.

    Returns:
        The frame built by ``nb.frame``, or an empty ``pd.DataFrame`` when
        the lookup fails for any reason (best-effort behaviour).
    """
    try:
        r = nb.frame(nb.unigram(x, period, media='avis'), x)
    except Exception:
        # Deliberately best-effort, but a bare ``except:`` would also swallow
        # KeyboardInterrupt and SystemExit; let those propagate.
        r = pd.DataFrame()
    return r