Example #1
def prepare_viz(doc_ids, docdict, doccats, x, y, catdesc={}, filepath='docs.json'):
    """
    function to prepare text data for a 2D visualization by saving a json file that is a list of dicts,
    where each dict describes one doc with "id" (doc_id), "x" and "y" (2D coordinates derived from the kernel matrix
    using classical scaling), "title" (category/ies), "description" (whatever is in docdict at doc_id), and "color" (for the cat)
    Input:
        doc_ids: list with keys for docdict and doccats
        docdict: dict with docid:'description'
        doccats: dict with docid: cat
        x, y: 2d coordinates for all data points in the order of doc_ids (use x, y = proj2d(K, use_tsne, evcrit))
        catdesc: category descriptions
        filepath: where the json file will be saved
    """
    # pretty preprocessing
    categories = set(invert_dict0(doccats).keys())
    if not catdesc:
        catdesc = {cat:cat for cat in categories}    
    colorlist = get_colors(len(categories))
    colordict = {cat:(255*colorlist[i][0],255*colorlist[i][1],255*colorlist[i][2]) for i, cat in enumerate(sorted(categories))}
    # save as json
    print("saving json")
    data_json = []
    for i, key in enumerate(doc_ids):
        data_json.append({"id":key,"x":x[i],"y":y[i],"title":str(key)+" (%s)"%catdesc[doccats[key]],"description":docdict[key],"color":"rgb(%i,%i,%i)"%colordict[doccats[key]]})
    with open(filepath,"w") as f:
        f.write(json.dumps(data_json,indent=2))
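A minimal usage sketch (not part of the original example): the doc ids, texts, categories and 2D coordinates below are made-up toy values; in practice x and y would come from proj2d(K, use_tsne, evcrit) as noted in the docstring.

# toy inputs for illustration; real coordinates would come from proj2d(K, use_tsne, evcrit)
doc_ids = ["d1", "d2", "d3"]
docdict = {"d1": "first article text", "d2": "second article text", "d3": "third article text"}
doccats = {"d1": "politics", "d2": "politics", "d3": "world"}
x, y = [0.1, 0.5, -0.3], [0.2, -0.4, 0.7]
prepare_viz(doc_ids, docdict, doccats, x, y,
            catdesc={"politics": "Politics", "world": "World"},
            filepath="docs.json")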
Example #2
def basic_viz(doc_ids, doccats, x, y, catdesc={}, title=''):
    """
    plot a scatter plot of the data in 2d
    Input:
        doc_ids: list with keys for docdict and doccats
        doccats: dict with docid: cat
        x, y: 2d coordinates for all data points in the order of doc_ids (use x, y = proj2d(K, use_tsne, evcrit))
        catdesc: category descriptions (for legend)
    """
    # pretty preprocessing
    categories = set(invert_dict0(doccats).keys())
    if not catdesc:
        catdesc = {cat:cat for cat in categories}    
    colorlist = get_colors(len(categories))
    colordict = {cat:(colorlist[i][0],colorlist[i][1],colorlist[i][2]) for i, cat in enumerate(sorted(categories))}
    # plot scatter plot
    plt.figure()
    for j, cat in enumerate(sorted(categories)):
        # get docids that belong to the current category
        didx_temp = [i for i, did in enumerate(doc_ids) if cat == doccats[did]]
        plt.plot(x[didx_temp], y[didx_temp], 'o', label=catdesc[cat], color=colordict[cat], alpha=0.6, markeredgewidth=0)
    plt.xticks([],[])
    plt.yticks([],[])
    #plt.axis('equal')
    plt.title(title)
    plt.legend(loc='center left', bbox_to_anchor=(1, 0.5), numpoints=1)
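A similar usage sketch for basic_viz (toy values again; numpy is assumed to be imported as np and matplotlib.pyplot as plt, as the function body already requires).

# toy inputs; in practice x, y = proj2d(K, use_tsne, evcrit)
doc_ids = ["d1", "d2", "d3"]
doccats = {"d1": "politics", "d2": "politics", "d3": "world"}
x, y = np.array([0.1, 0.5, -0.3]), np.array([0.2, -0.4, 0.7])
basic_viz(doc_ids, doccats, x, y,
          catdesc={"politics": "Politics", "world": "World"},
          title="document map")
plt.show()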
Example #3
def basic_viz(doc_ids, doccats, x, y, catdesc={}, title=''):
    """
    plot a scatter plot of the data in 2d
    Input:
        doc_ids: list with keys for docdict and doccats
        doccats: dict with docid: cat
        x, y: 2d coordinates for all data points in the order of doc_ids (use x, y = proj2d(K, use_tsne, evcrit))
        catdesc: category descriptions (for legend)
    """
    # pretty preprocessing
    categories = set(invert_dict0(doccats).keys())
    if not catdesc:
        catdesc = {cat: cat for cat in categories}
    colorlist = get_colors(len(categories))
    colordict = {
        cat: (colorlist[i][0], colorlist[i][1], colorlist[i][2])
        for i, cat in enumerate(sorted(categories))
    }
    # plot scatter plot
    plt.figure()
    for j, cat in enumerate(sorted(categories)):
        # get docids that belong to the current category
        didx_temp = [i for i, did in enumerate(doc_ids) if cat == doccats[did]]
        plt.plot(x[didx_temp],
                 y[didx_temp],
                 'o',
                 label=catdesc[cat],
                 color=colordict[cat],
                 alpha=0.6,
                 markeredgewidth=0)
    plt.xticks([], [])
    plt.yticks([], [])
    #plt.axis('equal')
    plt.title(title)
    plt.legend(loc='center left', bbox_to_anchor=(1, 0.5), numpoints=1)
def visualize_tfidf(textdict, doccats, create_html=True, visids=[], subdir_html='', subdir_wc='', maskfiles={}):
    """
    visualize a text categorization dataset w.r.t. tf-idf features (create htmls with highlighted words and word clouds)

    Input:
        textdict: dict with {doc_id: text}
        doccats: dict with {doc_id: category}
        create_html: whether to create the html files with scores highlighted for individual documents (default: True)
        visids: a subset of docids for which the html visualization should be created (optional)
                (if create_html=True but visids=[], select up to 1000 random ids)
        subdir_html: subdirectory to save the created html files in (has to exist)
        subdir_wc: subdirectory to save the created word cloud images in (has to exist)
        maskfiles: dict with {category: path_to_maskfile} for creating the word clouds in a specific form
    Returns:
        relevant_words: dict with {category: {word: relevancy score}}
    """
    print("possibly selecting subset of 10000 examples")
    textdict, doccats, visids = select_subset(textdict, doccats, visids)
    print("transforming text into features")
    # we can identify bigrams if we don't have to create htmls
    ft = FeatureTransform(norm='max', weight=True, renorm='max', identify_bigrams=not create_html, norm_num=False)
    docfeats = ft.texts2features(textdict)
    # maybe highlight the tf-idf scores in the documents
    if create_html:
        print("creating htmls for %i of %i documents" % (len(visids), len(docfeats)))
        for i, did in enumerate(visids):
            if not i % 100:
                print("progress: at %i of %i documents" % (i, len(visids)))
            metainf = did + '\n' + 'True Class: %s\n' % doccats[did]
            name = did + '_' + doccats[did]
            scores2html(textdict[did], docfeats[did], os.path.join(subdir_html, name.replace(' ', '_').replace('/', '_')), metainf)
    # get a map for each category to the documents belonging to it
    catdocs = invert_dict0(doccats)
    # create word clouds for each category by summing up tfidf scores
    scores_collected = {}
    for cat in catdocs:
        print("creating word cloud for category %r with %i samples" % (cat, len(catdocs[cat])))
        scores_collected[cat] = {}
        for did in catdocs[cat]:
            scores_collected[cat] = combine_dicts(scores_collected[cat], docfeats[did], sum)
        # create word cloud
        create_wordcloud(scores_collected[cat], os.path.join(subdir_wc, "%s.png" % cat), maskfiles[cat] if cat in maskfiles else None)
    return scores_collected
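A hedged usage sketch for visualize_tfidf: textdict and doccats are assumed to be loaded elsewhere, and the word-cloud subdirectory (a made-up name here) has to exist beforehand, as the docstring notes.

# textdict = {doc_id: text} and doccats = {doc_id: category} prepared beforehand
relevant_words = visualize_tfidf(textdict, doccats, create_html=False, subdir_wc="wordclouds")
for cat in relevant_words:
    top10 = sorted(relevant_words[cat], key=relevant_words[cat].get, reverse=True)[:10]
    print(cat, top10)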
Example #5
def prepare_viz(doc_ids,
                docdict,
                doccats,
                x,
                y,
                catdesc={},
                filepath='docs.json'):
    """
    function to prepare text data for a 2D visualization by saving a json file that is a list of dicts,
    where each dict describes one doc with "id" (doc_id), "x" and "y" (2D coordinates derived from the kernel matrix
    using classical scaling), "title" (category/ies), "description" (whatever is in docdict at doc_id), and "color" (for the cat)
    Input:
        doc_ids: list with keys for docdict and doccats
        docdict: dict with docid:'description'
        doccats: dict with docid: cat
        x, y: 2d coordinates for all data points in the order of doc_ids (use x, y = proj2d(K, use_tsne, evcrit))
        catdesc: category descriptions
        filepath: where the json file will be saved
    """
    # pretty preprocessing
    categories = set(invert_dict0(doccats).keys())
    if not catdesc:
        catdesc = {cat: cat for cat in categories}
    colorlist = get_colors(len(categories))
    colordict = {
        cat:
        (255 * colorlist[i][0], 255 * colorlist[i][1], 255 * colorlist[i][2])
        for i, cat in enumerate(sorted(categories))
    }
    # save as json
    print("saving json")
    data_json = []
    for i, key in enumerate(doc_ids):
        data_json.append({
            "id": key,
            "x": x[i],
            "y": y[i],
            "title": str(key) + " (%s)" % catdesc[doccats[key]],
            "description": docdict[key],
            "color": "rgb(%i,%i,%i)" % colordict[doccats[key]]
        })
    with open(filepath, "w") as f:
        f.write(json.dumps(data_json, indent=2))
Example #6
def check_occurrences(textdict, doccats, queries):
    """
    For all queries, check how often they occur in documents of a specific class

    Inputs:
        textdict: dict with {doc_id: text}
        doccats: dict with {doc_id: category}
        queries: some queries to check for; either strings or using check_and and check_or, e.g.
                 ['hello', check_and('italy', 'earthquake'), check_or('trump', 'obama')]
                 - due to preprocessing constraints, all query words have to be single words!
    Returns:
        results: a dict with {query: {category: frequency}}, e.g.
                 {'hello': {'politics': 0., 'world': 0.01},
                  'and:(italy, earthquake)': {'politics': 0.1, 'world': 0.2},
                  'or:(trump, obama)': {'politics': 0.9, 'world': 0.1}}
    """
    # invert doccats to get for every category the list of documents in it
    catdocs = invert_dict0(doccats)
    # do some preprocessing
    textdict = {
        did: set(re.findall(r"[a-z0-9-]+", textdict[did].lower()))
        for did in textdict
    }
    # check for all queries
    results = {}
    for q in queries:
        # convert regular string queries into functions as well
        if isinstance(q, str):
            q = check_in(q)
        # split in name to store results and query itself
        q, str_q = q
        results[str_q] = {}
        for cat in catdocs:
            results[str_q][cat] = len([
                1 for did in catdocs[cat] if q(textdict[did])
            ]) / float(len(catdocs[cat]))
    return results
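A usage sketch mirroring the docstring example (check_and and check_or are the query helpers referenced there; textdict and doccats are assumed to be given).

queries = ['hello', check_and('italy', 'earthquake'), check_or('trump', 'obama')]
results = check_occurrences(textdict, doccats, queries)
# e.g. the fraction of 'politics' documents matching the OR query
print(results['or:(trump, obama)']['politics'])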
def get_distinctive_words(textdict,
                          doccats,
                          distinctive_fun=distinctive_fun_quotdiff):
    """
    For every category, find distinctive (i.e. `distinguishing') words by comparing how often each word
    occurs in this target category compared to all other categories.

    Input:
        - textdict: a dict with {docid: text}
        - doccats: a dict with {docid: cat} (to get trends in time, cat could also be a year/day/week)
        - distinctive_fun: which formula should be used when computing the score (default: distinctive_fun_quotdiff)
    Returns:
        - distinctive_words: a dict with {cat: {word: score}},
          i.e. for every category the words and a score indicating
          how relevant the word is for this category (the higher the better)
          you could then do sorted(distinctive_words[cat], key=distinctive_words[cat].get, reverse=True)[:10]
          to get the 10 most distinguishing words for that category
    """
    # transform all texts into sets of preprocessed words and bigrams
    print("computing features")
    ft = FeatureTransform(norm='max',
                          weight=False,
                          renorm=False,
                          identify_bigrams=True,
                          norm_num=False)
    docfeats = ft.texts2features(textdict)
    #docfeats = {did: set(docfeats[did].keys()) for did in docfeats}
    # invert this dict to get for every word the documents it occurs in
    # word_dids = {word: set(dids) for word, dids in invert_dict1(docfeats).items()}
    # invert the doccats dict to get for every category a list of documents belonging to it
    cats_dids = {cat: set(dids) for cat, dids in invert_dict0(doccats).items()}
    # get a list of all words
    word_list = list(invert_dict2(docfeats).keys())
    # count the true positives for every word and category
    print("computing tpr for all words and categories")
    tpc_words = {}
    for word in word_list:
        tpc_words[word] = {}
        for cat in cats_dids:
            # out of all docs in this category, in how many did the word occur?
            #tpc_words[word][cat] = len(cats_dids[cat].intersection(word_dids[word])) / len(cats_dids[cat])
            # average tf score in the category
            # (don't just take mean of the list comprehension otherwise you're missing zero counts)
            tpc_words[word][cat] = sum([
                docfeats[did][word]
                for did in cats_dids[cat] if word in docfeats[did]
            ]) / len(cats_dids[cat])
    # for every category, compute a score for every word
    distinctive_words = {}
    for cat in cats_dids:
        print("computing distinctive words for category %r" % cat)
        distinctive_words[cat] = {}
        # compute a score for every word
        for word in word_list:
            # in how many of the target category documents the word occurs
            tpr = tpc_words[word][cat]
            if tpr:
                # in how many of the non-target category documents the word occurs (mean+std)
                fprs = [tpc_words[word][c] for c in cats_dids if not c == cat]
                fpr = np.mean(fprs) + np.std(fprs)
                # compute score
                distinctive_words[cat][word] = distinctive_fun(tpr, fpr)
    return distinctive_words
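A usage sketch following the hint at the end of the docstring (textdict and doccats assumed given).

distinctive_words = get_distinctive_words(textdict, doccats)
for cat in distinctive_words:
    # 10 most distinguishing words for this category, as suggested in the docstring
    top10 = sorted(distinctive_words[cat], key=distinctive_words[cat].get, reverse=True)[:10]
    print(cat, top10)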
                               'old': os.path.join('maskimgs', 'down.png')
                           })
 # visualize w/o html using clf (to get bigram word clouds)
 _ = visualize_clf(textdict,
                   doccats,
                   create_html=False,
                   subdir_wc=os.path.join(resdir, 'nytimes_wc_clf'),
                   maskfiles={
                       'current': os.path.join('maskimgs', 'up.png'),
                       'old': os.path.join('maskimgs', 'down.png')
                   })
 # Accuracy: 0.713
 ### experiment 2: cluster articles from the inauguration week
 textdict, doccats = get_articles('2017-01-16', '2017-01-22')
 clusters = cluster_texts(textdict)
 cluster_docs = invert_dict0(clusters)
 _ = visualize_distinctive(textdict,
                           clusters,
                           subdir_wc=os.path.join(
                               resdir, 'nytimes_wc_distinctive_clusters'))
 for c in sorted(cluster_docs,
                 key=lambda x: len(cluster_docs[x]),
                 reverse=True):
     print("#### %i documents in cluster %i" % (len(cluster_docs[c]), c))
     if not c == -1:
         for did in cluster_docs[c]:
             print(textdict[did].split("\n")[0])
 ### experiment 3: check the occurrences of some specific words
 textdict, doccats = get_articles('2016-12-26', '2017-01-22')
 queries = [
     check_or('and', 'or', 'the'), 'tuesday', 'trump', 'obama',