Example #1
def get_topics(df,
               dictionary,
               corpus,
               num_topics,
               name,
               method="GSDMM-Rust",
               alpha=0.1,
               beta=0.1):
    assert len(df) == len(corpus)
    if method == "LDA":
        dictionary, topics, scores = sentences_to_topic_model(
            sentences, args.num_topics)
    elif method == "GSDMM-Rust":
        dictionary, topics, scores = sentences_to_gsdmm_rust(dictionary,
                                                             corpus,
                                                             args.num_topics,
                                                             name,
                                                             alpha=alpha,
                                                             beta=beta)
    elif method == "GSDMM":
        dictionary, topics, scores = sentences_to_gsdmm(
            sentences, args.num_topics)

    print("exporting dictionary and nparray")

    # export everything
    dictionary.save(getFile(name, Datafile.DICTIONARY))
    np.save(getFile(name, Datafile.TOPIC_NDARRAY), topics)

    print("preparing scores")
    scores_df = scores
    assert (scores.index == df.index).all()
    scores_df.to_csv(getFile(name, Datafile.SCORES), sep="\t", index=False)
    scores_sums = scores_df.sum()
    scores_sums = scores_sums.sort_index()
    # Write this here because the graph algo step requires this info
    # and the analyze_topics.py as it is right now depends on the graph algo
    # output
    records = pd.DataFrame(scores_sums,
                           columns=["size"]).to_dict(orient="index")
    with open(getFile(name, Datafile.TOPIC_JSON), "wt") as f:
        f.write(json.dumps(records))

    scores_df["dominant_topic"] = scores_df.idxmax(axis=1)
    scores_df["title"] = df["title"]
    assert scores_df["title"].notnull().all()
    scores_df["url"] = df["url"]
    scores_df["publish_date"] = df["publish_date"]
    media_names = df["media_name"].fillna("No Media Name")
    scores_df["media_name"] = media_names
    scores_df[["dominant_topic", "title", "media_name", "url",
               "publish_date"]].to_csv(getFile(name, Datafile.HEADLINES_TSV),
                                       sep="\t",
                                       index=False)
    assert len(scores_df) == len(corpus)

    return topics
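
For orientation, `scores` here is assumed to be a documents-by-topics score matrix (one column per topic id), so `dominant_topic` is simply the column label with the highest score in each row. A minimal sketch with made-up numbers:

import pandas as pd

# hypothetical scores for 3 documents over 3 topics
scores_df = pd.DataFrame(
    [[0.7, 0.2, 0.1],
     [0.1, 0.1, 0.8],
     [0.4, 0.5, 0.1]],
    columns=[0, 1, 2],
)
scores_df["dominant_topic"] = scores_df[[0, 1, 2]].idxmax(axis=1)
print(scores_df["dominant_topic"].tolist())  # [0, 2, 1]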
Example #2
def get_degrees(name, topics_json):
    # Maybe this code should live in cluster_topics?

    graph = nx.read_gpickle(getFile(name, Datafile.GRAPH_PICKLE))
    for topic in topics_json:
        try:
            topics_json[topic]["degree"] = graph.degree[topic]
        except KeyError:
            # topic has no node in the graph; record degree 0 instead
            topics_json[topic]["degree"] = 0
    return topics_json
Example #3
def get_word_relevance(name, topics_json):
    print("Calcuating word relevance")

    dictionary = corpora.Dictionary.load(getFile(name, Datafile.DICTIONARY))
    LAMBDA = 0.9
    topic_ndarray = np.load(getFile(name, Datafile.TOPIC_NDARRAY))
    ps_token_corpus = np.array([
        dictionary.cfs[token] / dictionary.num_pos
        for token in dictionary.keys()
    ])
    # tokens we never want to surface in the exported word lists
    BLACKLIST = [
        "ma_zone_forecast",
        "lottery_state_by",
        "mobile_world",
        "ct_boston_norton",
        "richard_grenell",
        "east_africa",
        "credit_cards",
    ]
    for topic, row in enumerate(topic_ndarray):
        if topic not in topics_json:
            continue
        sum_topic = np.sum(row)
        # relevance blends within-topic probability with lift over the corpus
        # frequency (LDAvis-style, weighted by LAMBDA)
        topic_word_relevance = (
            row / sum_topic * LAMBDA +
            (1 - LAMBDA) * row / sum_topic / ps_token_corpus)
        top_relevant_tokens = np.argsort(topic_word_relevance)[::-1][0:20]
        top_common_tokens = np.argsort(row)[::-1][0:20]
        topics_json[topic]["relevant_words"] = [
            [dictionary.id2token[tok], topic_word_relevance[tok]]
            for tok in top_relevant_tokens
            if dictionary.id2token[tok] not in BLACKLIST
        ]
        topics_json[topic]["common_words"] = [
            [dictionary.id2token[tok], topic_word_relevance[tok]]
            for tok in top_common_tokens
            if dictionary.id2token[tok] not in BLACKLIST
        ]
        # del topics_json[str(topic)]["words"]

    return topics_json
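
The `topic_word_relevance` line blends a token's within-topic probability with its lift over the corpus-wide frequency, similar in spirit to the LDAvis relevance metric (here without logarithms). A standalone sketch of the same arithmetic with invented counts:

import numpy as np

LAMBDA = 0.9
row = np.array([30.0, 10.0, 5.0])       # hypothetical token counts for one topic
p_corpus = np.array([0.5, 0.01, 0.02])  # hypothetical corpus-wide token frequencies

p_topic = row / row.sum()               # P(token | topic)
lift = p_topic / p_corpus               # boost for tokens rare in the corpus overall
relevance = LAMBDA * p_topic + (1 - LAMBDA) * lift
print(np.argsort(relevance)[::-1])      # token 1 outranks token 0 despite fewer counts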
Example #4
    def export(self):
        """
        Export to a format that we can pass to JS frontend
        Honestly, the headlines might be slower to fetch
        We might need to build a backend LOL
        We'll just dump the points, edges, and elevations basically
        """
        vx_df = pd.DataFrame(self.pts, columns=["x", "y"])
        # df.to_csv(getFile(self.name, Datafile.POINTS_TSV), sep="\t")
        # df = pd.DataFrame(self.vor.vertices, columns=["x", "y"])

        # water_coordinates, water_regions = self.compress_water()

        df = pd.DataFrame()
        df["elevation"] = self.elevation
        # df["coordinates"] = self.vor.regions
        df["coordinates"] = [
            json.dumps(row.tolist()) for row in self.delaunay.simplices
        ]
        df["headlines"] = self.headlines
        df["is_edge"] = self.edges
        # ugly but i hate np
        df["topics"] = self.topics
        df["flux"] = self.flux_map
        df["moisture"] = self.moisture
        df["temperature"] = self.temperature
        df["shadow"] = self.shadow

        # df = df[~df.index.isin(water_regions)]
        df = df[df["elevation"] > 0]
        df.to_csv(getFile(self.name, Datafile.REGIONS_TSV), sep="\t")

        used_vxs = set()
        for c in df["coordinates"]:
            used_vxs.update(json.loads(c))
        vx_df = vx_df.reset_index().iloc[list(used_vxs)]
        vx_df.to_csv(getFile(self.name, Datafile.VERTICES_TSV), sep="\t")
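
For context, the `coordinates` column above stores each region's triangle as vertex indices into `self.pts`, which is how `scipy.spatial.Delaunay` represents its triangulation. A toy illustration (unrelated points, just to show the shape of `.simplices`):

import numpy as np
from scipy.spatial import Delaunay

pts = np.array([[0.0, 0.0], [1.0, 0.0], [0.0, 1.0], [1.0, 1.0]])
tri = Delaunay(pts)

# each row of .simplices is one triangle, given as three indices into pts
print(tri.simplices)          # e.g. [[3 1 0] [2 3 0]]
print(pts[tri.simplices[0]])  # the (x, y) corners of the first triangle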
Example #5
    def add_lakes(self):
        """
        Add lakes if required given the input graph
        """
        graph = nx.read_gpickle(getFile(self.name, Datafile.GRAPH_PICKLE))
        # Basically, we want to examine each Voronoi ridge
        # and if its corresponding edge doesn't exist in the input graph,
        # then we "kill" it with a lake
        extra_points = []
        for i, pair in enumerate(self.rough_voronoi.ridge_points):
            topicA = self.get_rough_topic(pair[0])
            topicB = self.get_rough_topic(pair[1])
            if topicA == -1 or topicB == -1:
                # One of them is a water cell, so w/e
                continue
            if topicA == topicB:
                # bordering yourself is OK
                continue
            if topicB in graph[topicA]:
                # The edge exists in the graph, so it's OK.
                continue
            # If we reach this point, we have a Voronoi ridge
            # that links two topics that SHOULD NOT have an edge
            ridge_vertices = self.rough_voronoi.vertices[
                self.rough_voronoi.ridge_vertices[i]]
            extra_points.extend(self.create_lake_points(ridge_vertices))

        self.set_rough_points(
            self.rough_topic_points,
            pd.concat(
                (
                    self.rough_points,
                    pd.DataFrame(extra_points, columns=["x", "y"]),
                ),
                ignore_index=True,
            ),
        )

        plt_voronoi(self.rough_points, self.topic_df)
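
For reference on the SciPy attributes used here: `Voronoi.ridge_points[i]` is the pair of input points whose cells share ridge `i`, and `Voronoi.ridge_vertices[i]` lists the indices of that ridge's Voronoi vertices, with -1 standing for a vertex at infinity. A small standalone example:

import numpy as np
from scipy.spatial import Voronoi

pts = np.array([[0.0, 0.0], [1.0, 0.0], [0.0, 1.0], [1.0, 1.0], [0.5, 0.5]])
vor = Voronoi(pts)

for i, (a, b) in enumerate(vor.ridge_points):
    verts = vor.ridge_vertices[i]
    if -1 in verts:
        continue  # unbounded ridge with no finite endpoints
    # the finite ridge separating the cells of input points a and b
    print(a, b, vor.vertices[verts])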
Example #6
        required=False,
        help=
        "if provided, a group number to restrict to (useful for debugging)",
    )
    parser.add_argument(
        "-matplotlib",
        dest="matplotlib",
        action="store_const",
        const=True,
        required=False,
        help="if provided, output intermediate matplotlibs",
    )
    args = parser.parse_args()
    name = names.getName(args.name, args.start, args.interval)

    topic_df = pd.read_csv(getFile(name, Datafile.TOPIC_METADATA_TSV),
                           sep="\t",
                           index_col=0)

    with open(getFile(name, Datafile.LAYOUT), "rt") as f:
        layout = json.load(f)["layouts"]
        layout = sorted([item for sl in layout for item in sl],
                        key=lambda x: x["id"])
        layout_df = pd.DataFrame(layout)
        layout_df.index = layout_df["id"]
    # this should be moved somewhere else :P
    assert (topic_df.index == layout_df.index).all()
    topic_df["x"] = layout_df["x"]
    topic_df["y"] = layout_df["y"]
    topic_df["group"] = layout_df["group"]
Example #7
def analyze_topics(name, headlines=None, scores=None):
    if headlines is None:
        headlines = pd.read_csv(getFile(name, Datafile.HEADLINES_TSV),
                                sep="\t")
    if scores is None:
        scores = pd.read_csv(getFile(name, Datafile.SCORES), sep="\t")

    # rps = np.genfromtxt(getFile(name, Datafile.RUST_PROBABILITIES), delimiter=",")
    # headlines = headlines[headlines["title"].notnull()].reset_index()
    assert len(headlines) == len(scores)

    scores.columns = scores.columns.astype(int)
    scores_sums = scores.sum()
    scores_sums = scores_sums.sort_index()

    sentiments = np.zeros(len(headlines))
    """
    for i, row in headlines.iterrows():
        try:
            sentiments[i] = score_sentiment(row["title"])
        except:
            pdb.set_trace()
    """

    headlines["subjectivity"] = sentiments

    subj_map = headlines.groupby("dominant_topic")[["subjectivity"]].mean()
    count_map = headlines.groupby("dominant_topic").count()["title"]
    # get normalized count by media_name
    media_diversity = (headlines.groupby(
        ["dominant_topic", "media_name"]).count()["title"].unstack().fillna(
            0).apply(lambda x: x / np.sum(x)).apply(
                lambda x: scipy.stats.mstats.gmean(x) / np.mean(x), axis=1))
    assert (scores_sums.index == subj_map.index).all()
    subj_map["media_diversity"] = media_diversity
    subj_map["count"] = count_map
    subj_map["size"] = scores_sums
    subj_map.to_csv(getFile(name, Datafile.TOPIC_METADATA_TSV), sep="\t")
    records = subj_map.to_dict(orient="index")
    for topic in subj_map.index:
        recent_headlines = scores[topic][
            scores[topic] > 0.9999].iloc[::-1][0:100]
        try:
            hdf = headlines.iloc[recent_headlines.index][[
                "title", "url", "media_name", "publish_date"
            ]]
            records[topic]["articles"] = hdf.to_dict(orient="records")
        except (KeyError, IndexError):
            # rows missing from headlines (e.g. dropped earlier) get no articles
            records[topic]["articles"] = []

    records = get_degrees(name, records)
    records = get_word_relevance(name, records)

    for topic in records:
        records[topic]["region_name"] = get_name(records, topic)

    topic_json = json.load(open(getFile(name, Datafile.TOPIC_JSON)))
    for topic in topic_json:
        topic_json[topic].update(records[int(topic)])

    with open(getFile(name, Datafile.TOPIC_JSON), "wt") as f:
        f.write(json.dumps(topic_json))
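
The `media_diversity` expression above scores each topic by the ratio of the geometric mean to the arithmetic mean of its per-outlet headline shares: roughly 1.0 when coverage is spread evenly across outlets, and closer to 0 when a single outlet dominates. A quick standalone check with toy shares (all positive, to sidestep how zero counts are handled):

import numpy as np
import scipy.stats

even = np.array([0.25, 0.25, 0.25, 0.25])    # four outlets with equal shares
skewed = np.array([0.94, 0.02, 0.02, 0.02])  # one outlet dominates

for shares in (even, skewed):
    ratio = scipy.stats.mstats.gmean(shares) / np.mean(shares)
    print(round(float(ratio), 3))  # ~1.0 for even, ~0.21 for skewed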
Example #8
def sentences_to_gsdmm_rust(dictionary,
                            corpus,
                            num_topics,
                            name,
                            alpha=0.1,
                            beta=0.1):
    # prepare files for consumption by rust executable:
    # vocabfile: one token per line
    print("preparing input for gsdmm-rust")
    with open("data/vocabfile.txt", "wt") as f:
        for t in dictionary.itervalues():
            f.write(t)
            f.write("\n")
    index = []
    # sentences file: one whitespace-separated document per line, each token
    # repeated by its bag-of-words count; `index` tracks which corpus rows were
    # non-empty so the scores can be aligned back to the dataframe later
    with open("data/sentences.txt", "wt") as f:
        for i, doc in enumerate(corpus):
            arr = []
            for tok in doc:
                for _ in range(tok[1]):
                    arr.append(dictionary.id2token[tok[0]])
            if arr:
                f.write(" ".join(arr))
                f.write("\n")
                index.append(i)

    print("spawning gsdmm-rust subprocess")
    # spawn the rust subprocess
    stream_p = subprocess.Popen(
        [
            "gsdmm-rust/target/release/gsdmm",
            "data/sentences.txt",
            "data/vocabfile.txt",
            f"data/{name}",
            "-k",
            str(num_topics),
            "-a",
            f"{alpha}",
            "-b",
            f"{beta}",
            "-m",
            "50",
        ],
        shell=False,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    # stream the subprocess's stdout until it exits
    while True:
        output = stream_p.stdout.readline()
        if stream_p.poll() is not None:
            break
        if output:
            print(output.decode(errors="replace").strip())

    # now read the cluster descriptions file into an ndarray
    print("retrieving gsdmm-rust output")
    mapping = dictionary.token2id
    topic_ndarray = np.zeros((num_topics, len(mapping)))

    with open(getFile(name, Datafile.RUST_CLUSTER_DESC), "rt") as f:
        # each line: "<cluster_id> token:count token:count ..."
        while True:
            line = f.readline().strip()
            if not line:
                break
            line = line.split(" ")
            cluster_i = int(line[0])
            cluster_words = line[1:]
            for pair in cluster_words:
                comps = pair.split(":")
                token = comps[0]
                val = int(comps[1])
                tokid = mapping[token]
                topic_ndarray[cluster_i][tokid] = val

    scores = []
    with open(getFile(name, Datafile.RUST_LABELS), "rt") as f:
        # each line: "<cluster_label>,<score>" for one input document
        while True:
            line = f.readline().strip()
            if not line:
                break
            comps = line.split(",")
            scores.append({int(comps[0]): float(comps[1])})
    scores = pd.DataFrame(scores).fillna(0)
    scores.index = index
    return dictionary, topic_ndarray, scores
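
The expansion loop in this function assumes `corpus` is a gensim bag-of-words corpus, i.e. each document is a list of `(token_id, count)` pairs that get unrolled back into a space-separated line of tokens. A minimal sketch of producing such a corpus (hypothetical documents, standard gensim calls):

from gensim import corpora

docs = [["economy", "jobs", "jobs"], ["storm", "coast"]]
dictionary = corpora.Dictionary(docs)
corpus = [dictionary.doc2bow(doc) for doc in docs]  # e.g. [[(0, 1), (1, 2)], [(2, 1), (3, 1)]]

# the same unrolling used when writing data/sentences.txt
for doc in corpus:
    tokens = []
    for tok_id, count in doc:
        tokens.extend([dictionary[tok_id]] * count)
    print(" ".join(tokens))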
Example #9
        nonempty = [i for i, val in enumerate(corpus) if val != []]
        corpus = [i for i in corpus if i != []]
        df = df.iloc[nonempty].reset_index()

        topics = get_topics(
            df,
            dictionary,
            corpus,
            args.num_topics,
            name,
            alpha=args.alpha,
            beta=args.beta,
        )

        print(f"saving topics and dictionaries...")
        dictionary.save(getFile(name, Datafile.DICTIONARY))
        np.save(getFile(name, Datafile.TOPIC_NDARRAY), topics)

    elif args.load:
        basename = args.load
        name = names.getName(basename, args.start, args.interval)
        topics = np.load(getFile(name, Datafile.TOPIC_NDARRAY))
        dictionary = corpora.Dictionary.load(getFile(name,
                                                     Datafile.DICTIONARY))

    name2 = None
    if args.step:
        print("calculating intertopic distances")
        name2 = names.getPrevName(basename, args.start, args.interval,
                                  args.step)
        topic2_filename = getFile(name2, Datafile.TOPIC_NDARRAY)