def run_random_walks(data_dir, weight_edges=False):
    print "Loading data and building transition matrix..."
    examples = util.load_json('./data/' + data_dir + '/examples.json')
    G = nx.read_edgelist('./data/' + data_dir + '/graph.txt', nodetype=int)
    if weight_edges:
        reviews = util.load_json('./data/' + data_dir + '/review.json')
        end_date = datetime.date(2012, 1, 1) if data_dir == 'train' else datetime.date(2013, 1, 1)
        edges = G.edges()
        for e in util.logged_loop(edges, util.LoopLogger(20000, len(edges), True)):
            n1, n2 = str(e[0]), str(e[1])
            if n1 not in reviews or n2 not in reviews[n1]:
                n1, n2 = n2, n1
            G[e[0]][e[1]]['weight'] = 1.0 / ((end_date - get_date(reviews[n1][n2][0])).days + 90)
        del reviews  # save some memory

    adjacency_matrix = nx.adjacency_matrix(G)
    inverse_degree_matrix = sparse.diags([[1.0 / adjacency_matrix.getrow(i).sum()
                                           for i in range(adjacency_matrix.shape[0])]], [0])
    transition_matrix = inverse_degree_matrix.dot(adjacency_matrix)

    print "Running random walks..."
    for u in util.logged_loop(examples, util.LoopLogger(10, len(examples), True)):
        p = run_random_walk(transition_matrix, int(u), 10).todense()
        for b in examples[u]:
            examples[u][b] = p[0, int(b)]

    util.write_json(examples, './data/' + data_dir
                    + ('/weighted_random_walks.json' if weight_edges else '/random_walks.json'))
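
The snippet above calls run_random_walk, which is not shown on this page. Below is a minimal sketch of such a helper, assuming it performs a fixed number of random-walk-with-restart steps from the start node and returns a sparse 1 x N row of visit probabilities (consistent with the .todense() call and p[0, int(b)] indexing above); the restart_prob default is an illustrative assumption, not a value from the original project.

from scipy import sparse

def run_random_walk(transition_matrix, start, num_steps, restart_prob=0.15):
    # Hypothetical sketch: num_steps iterations of a random walk with restart.
    n = transition_matrix.shape[0]
    # One-hot start distribution as a sparse 1 x n row.
    p0 = sparse.csr_matrix(([1.0], ([0], [start])), shape=(1, n))
    p = p0
    for _ in range(num_steps):
        # Follow an outgoing edge with probability (1 - restart_prob),
        # otherwise jump back to the start node.
        p = (1 - restart_prob) * p.dot(transition_matrix) + restart_prob * p0
    return p  # the caller above does .todense() and reads p[0, business_id]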
Example 2
def graph_propagate(embeddings, positive_seeds, negative_seeds, **kwargs):
    """
    Graph propagation method adapted from Velikovich, Leonid, et al. "The viability of web-derived polarity lexicons."
    http://www.aclweb.org/anthology/N10-1119
    Should be used with arccos=True
    """
    def run_graph_propagate(seeds, alpha_mat, trans_mat, T=1, **kwargs):
        def get_rel_edges(ind_set):
            rel_edges = set([])
            for node in ind_set:
                rel_edges = rel_edges.union([
                    (node, other) for other in trans_mat[node, :].nonzero()[1]
                ])
            return rel_edges

        for seed in seeds:
            F = set([seed])
            for t in range(T):
                for edge in get_rel_edges(F):
                    alpha_mat[seed, edge[1]] = max(
                        alpha_mat[seed, edge[1]],
                        alpha_mat[seed, edge[0]] * trans_mat[edge[0], edge[1]])
                    F.add(edge[1])
        return alpha_mat

    M = similarity_matrix(embeddings, **kwargs)
    M = (M + M.T) / 2
    print("Getting positive scores..")
    pos_alpha = M.copy()
    neg_alpha = M.copy()
    M = csr_matrix(M)
    pos_alpha = run_graph_propagate(
        [embeddings.wi[seed] for seed in positive_seeds], pos_alpha, M,
        **kwargs)
    pos_alpha = pos_alpha + pos_alpha.T
    print("Getting negative scores..")
    neg_alpha = run_graph_propagate(
        [embeddings.wi[seed] for seed in negative_seeds], neg_alpha, M,
        **kwargs)
    neg_alpha = neg_alpha + neg_alpha.T
    print("Computing final scores...")
    polarities = {}
    index = embeddings.wi
    pos_pols = {w: 1.0 for w in positive_seeds}
    for w in negative_seeds:
        pos_pols[w] = 0.0
    neg_pols = {w: 1.0 for w in negative_seeds}
    for w in positive_seeds:
        neg_pols[w] = 0.0
    for w in util.logged_loop(index):
        if w not in positive_seeds and w not in negative_seeds:
            pos_pols[w] = sum(pos_alpha[index[w], index[seed]]
                              for seed in positive_seeds if seed in index)
            neg_pols[w] = sum(neg_alpha[index[w], index[seed]]
                              for seed in negative_seeds if seed in index)
    beta = np.sum(list(pos_pols.values())) / np.sum(list(neg_pols.values()))
    for w in index:
        polarities[w] = pos_pols[w] - beta * neg_pols[w]
    return polarities
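
For intuition, here is a small self-contained toy version of the inner run_graph_propagate loop above: starting from the direct edge weights, each iteration expands the frontier around the seed and keeps, for every reached node, the best product-of-weights path score found so far (the best-path propagation of Velikovich et al.). The 3-node weight matrix is made up purely for illustration.

import numpy as np
from scipy.sparse import csr_matrix

trans = csr_matrix(np.array([[0.0, 0.9, 0.0],
                             [0.9, 0.0, 0.5],
                             [0.0, 0.5, 0.0]]))
alpha = trans.toarray()   # start from direct edge weights (the role of M.copy() above)
seed = 0
frontier = {seed}
for _ in range(2):        # T = 2 expansion steps
    for node in list(frontier):
        for other in trans[node, :].nonzero()[1]:
            alpha[seed, other] = max(alpha[seed, other],
                                     alpha[seed, node] * trans[node, other])
            frontier.add(other)
print(alpha[seed])        # [0.81 0.9  0.45]; node 2 is reached via 0 -> 1 -> 2 (0.9 * 0.5)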
Example 3
def run_random_walks(data_dir, weight_edges=False):
    print("Loading data and building transition matrix...")
    examples = util.load_json('./data/' + data_dir +
                              '/oag_examples_simple.json')
    G = nx.read_edgelist('./data/' + data_dir + '/graph.txt', nodetype=int)

    # Get all nodes, but not the edges (those need to be predicted)
    with open('./data/nid_to_id.txt', 'r') as f:
        for line in f:
            keys = line.split()
            node = int(keys[0])  # graph nodes are ints (nodetype=int above)
            if node not in G:
                G.add_node(node)

    # Real id to substitute id
    #id_map = {}
    #count = 0
    #for n in G:
    #    id_map[n] = count
    #    count += 1

    #if weight_edges:
    #    reviews = util.load_json('./data/' + data_dir + '/review.json')
    #    end_date = datetime.date(2012, 1, 1) if data_dir == 'train' else datetime.date(2013, 1, 1)
    #    edges = G.edges()
    #    for e in util.logged_loop(edges, util.LoopLogger(20000, len(edges), True)):
    #        n1, n2 = str(e[0]), str(e[1])
    #        if n1 not in reviews or n2 not in reviews[n1]:
    #            n1, n2 = n2, n1
    #        G[e[0]][e[1]]['weight'] = 1.0 / ((end_date - get_date(reviews[n1][n2][0])).days + 90)
    #    del reviews  # save some memory

    adjacency_matrix = nx.adjacency_matrix(G)
    inverse_degree_matrix = sparse.diags([[
        # guard: nodes added from nid_to_id.txt may be isolated (degree 0)
        1.0 / adjacency_matrix.getrow(i).sum()
        if adjacency_matrix.getrow(i).sum() > 0 else 0.0
        for i in range(adjacency_matrix.shape[0])
    ]], [0])
    transition_matrix = inverse_degree_matrix.dot(adjacency_matrix)

    print("Running random walks...")
    for u in util.logged_loop(examples,
                              util.LoopLogger(10, len(examples), True)):
        p = run_random_walk(transition_matrix, int(u),
                            10).todense()  # 1 x N row of visit probabilities
        for b in examples[u]:
            examples[u][b] = p[0, int(b)]

    util.write_json(
        examples,
        './data/' + data_dir + ('/oag_weighted_random_walks.json'
                                if weight_edges else '/oag_random_walks.json'))
Example 4
def run_random_walks(data_dir, weight_edges=False):
    print "Loading data and building transition matrix..."
    examples = util.load_json('./data/' + data_dir + '/examples.json')
    G = nx.read_edgelist('./data/' + data_dir + '/graph.txt', nodetype=int)
    if weight_edges:
        reviews = util.load_json('./data/' + data_dir + '/review.json')
        end_date = datetime.date(
            2012, 1, 1) if data_dir == 'train' else datetime.date(2013, 1, 1)
        edges = G.edges()
        for e in util.logged_loop(edges,
                                  util.LoopLogger(20000, len(edges), True)):
            n1, n2 = str(e[0]), str(e[1])
            if n1 not in reviews or n2 not in reviews[n1]:
                n1, n2 = n2, n1
            G[e[0]][e[1]]['weight'] = 1.0 / (
                (end_date - get_date(reviews[n1][n2][0])).days + 90)
        del reviews  # save some memory

    adjacency_matrix = nx.adjacency_matrix(G)
    inverse_degree_matrix = sparse.diags([[
        1.0 / adjacency_matrix.getrow(i).sum()
        for i in range(adjacency_matrix.shape[0])
    ]], [0])
    transition_matrix = inverse_degree_matrix.dot(adjacency_matrix)

    print "Running random walks..."
    for u in util.logged_loop(examples,
                              util.LoopLogger(10, len(examples), True)):
        p = run_random_walk(transition_matrix, int(u), 10).todense()
        for b in examples[u]:
            examples[u][b] = p[0, int(b)]

    util.write_json(
        examples,
        './data/' + data_dir + ('/weighted_random_walks.json'
                                if weight_edges else '/random_walks.json'))
Example 5
def write_probable_pairs(dataset_name, action_space_path, scores):
    probable_pairs = {}
    margin_removals = 0
    total_pairs = 0
    total_size = 0
    for did in util.logged_loop(scores):
        doc_scores = scores[did]
        pairs = sorted([pair for pair in doc_scores.keys() if pair[0] != -1],
                       key=lambda pr: doc_scores[pr] - (-1 - 0.3*doc_scores[(-1, pr[1])]),
                       reverse=True)

        total_pairs += len(pairs)
        probable_pairs[did] = []
        for pair in pairs:
            score = doc_scores[pair] - (-1 - 0.3*doc_scores[(-1, pair[1])])
            if score < SCORE_THRESHOLD:
                break
            probable_pairs[did].append(pair)

        max_scores = {}
        for pair in probable_pairs[did]:
            if pair[1] not in max_scores:
                max_scores[pair[1]] = max(doc_scores[pair], -1 - 0.3*doc_scores[(-1, pair[1])])
            else:
                max_scores[pair[1]] = max(max_scores[pair[1]], doc_scores[pair])
        margin_removals += len(probable_pairs[did])
        probable_pairs[did] = [p for p in probable_pairs[did] if
                               doc_scores[p] - max_scores[p[1]] > MARGIN_THRESHOLD]
        margin_removals -= len(probable_pairs[did])
        total_size += len(probable_pairs[did])

    print "num docs:", len(scores)
    print "avg size without filter: {:.1f}".format(total_pairs / float(len(scores)))
    print "avg size: {:.1f}".format(total_size / float(len(scores)))
    print "margin removals size: {:.1f}".format(margin_removals / float(len(scores)))
    util.write_pickle(probable_pairs, action_space_path + dataset_name + '_probable_pairs.pkl')
    shutil.copyfile('clustering_preprocessing.py',
                    action_space_path + 'clustering_preprocessing.py')
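
write_probable_pairs refers to SCORE_THRESHOLD and MARGIN_THRESHOLD, module-level constants defined elsewhere in the project; a minimal sketch of the preamble this snippet assumes is shown below. The numeric values are illustrative placeholders, not the project's actual settings.

import shutil
import util

# Illustrative placeholder values only; the real thresholds are set elsewhere.
SCORE_THRESHOLD = 0.5     # cut-off on a pair's score relative to the "new cluster" (-1) action
MARGIN_THRESHOLD = -1.0   # cut-off on the gap between a pair's score and the best score for that mention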
def make_examples(data_dir, n_users=5000, min_degree=1, negative_sample_rate=0.01,
                  min_active_time=None, new_edge_only=False):
    print "Loading data..."
    # TODO: switch to networkx?
    G = snap.LoadEdgeList(snap.PUNGraph, data_dir + 'graph.txt', 0, 1)
    with open(data_dir + 'new_edges.txt') as f:
        edges = {tuple(map(int, line.split())) for line in f}
    new_edge_count = Counter()
    for (u, b) in edges:
        new_edge_count[u] += 1
    review_data = util.load_json(data_dir + 'review.json')
    n_businesses = len(util.load_json(data_dir + "business.json"))

    recently_active_users = []
    other_users = []
    print "Getting candidate set of users..."
    users = []
    for Node in util.logged_loop(G.Nodes(), util.LoopLogger(50000, G.GetNodes(), True)):
        u = Node.GetId()
        if new_edge_only and u not in new_edge_count:
            continue
        if str(u) not in review_data or Node.GetOutDeg() < min_degree:
            continue
        if min_active_time:
            recent_review = False
            for b in review_data[str(u)]:
                if (int(u), int(b)) in edges:
                    continue
                for r in review_data[str(u)][b]:
                    if get_date(r) > min_active_time:
                        users.append(u)
                        recently_active_users.append(u)
                        recent_review = True
                        break
                if recent_review:
                    break
            if not recent_review:
                other_users.append(u)
        else:
            users.append(u)

    if min_active_time:
        recent_positive = sum(new_edge_count[u] for u in recently_active_users)
        recent_examples = len(recently_active_users) * n_businesses
        other_positive = sum(new_edge_count[u] for u in other_users)
        other_examples = len(other_users) * n_businesses
        print "Positives retained from recently active filter:", \
            recent_positive / float(recent_positive + other_positive)
        print "Negatives retained from recently active filter:", \
            (recent_examples - recent_positive) / \
            float(recent_examples - recent_positive + other_examples - other_positive)

    random.seed(0)
    users = random.sample(users, n_users)

    print "Getting candidate set of edges..."
    examples = defaultdict(dict)
    for u in util.logged_loop(users, util.LoopLogger(50, n_users, True)):
        candidate_businesses = snap.TIntV()
        snap.GetNodesAtHop(G, u, 3, candidate_businesses, True)
        for b in candidate_businesses:
            if (u, b) in edges:
                examples[u][b] = 1
            elif random.random() < negative_sample_rate:
                examples[u][b] = 0

    hop3_positives = 0
    for u in examples:
        for b in examples[u]:
            hop3_positives += examples[u][b]
    hop3_examples = sum(len(examples[u]) for u in examples)
    n_positives = sum([new_edge_count[u] for u in users])
    n_examples = len(users) * n_businesses
    print "Positives retained from hop3 filter:", hop3_positives / float(n_positives)
    print "Negatives retained from hop3 filter:", (hop3_examples - hop3_positives) / \
            (negative_sample_rate * float(n_examples - n_positives))
    print "Data skew:", hop3_positives / float(hop3_examples)

    print "Writing examples..."
    util.write_json(examples, data_dir + 'examples.json')
def reviews_iterator(path='./data/provided/yelp_academic_dataset_review.json'):
    return util.logged_loop(util.load_json_lines(path),
                            util.LoopLogger(100000, util.lines_in_file(path), True))
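
Several of the snippets above (run_random_walks with weight_edges=True and make_examples) also rely on a get_date helper that is not shown. A minimal sketch, assuming each stored review keeps its Yelp-style 'date' string in 'YYYY-MM-DD' format; the project's real helper may parse dates differently.

import datetime

def get_date(review):
    # Hypothetical helper: parse a review's 'date' field into a datetime.date.
    return datetime.datetime.strptime(review['date'], '%Y-%m-%d').date()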
Example 8
def make_examples(data_dir,
                  n_users=5000,
                  min_degree=1,
                  negative_sample_rate=0.01,
                  min_active_time=None,
                  new_edge_only=False):
    print "Loading data..."
    # TODO: switch to networkx?
    G = snap.LoadEdgeList(snap.PUNGraph, data_dir + 'graph.txt', 0, 1)
    with open(data_dir + 'new_edges.txt') as f:
        edges = {tuple(map(int, line.split())) for line in f}
    new_edge_count = Counter()
    for (u, b) in edges:
        new_edge_count[u] += 1
    review_data = util.load_json(data_dir + 'review.json')
    n_businesses = len(util.load_json(data_dir + "business.json"))

    recently_active_users = []
    other_users = []
    print "Getting candidate set of users..."
    users = []
    for Node in util.logged_loop(G.Nodes(),
                                 util.LoopLogger(50000, G.GetNodes(), True)):
        u = Node.GetId()
        if new_edge_only and u not in new_edge_count:
            continue
        if str(u) not in review_data or Node.GetOutDeg() < min_degree:
            continue
        if min_active_time:
            recent_review = False
            for b in review_data[str(u)]:
                if (int(u), int(b)) in edges:
                    continue
                for r in review_data[str(u)][b]:
                    if get_date(r) > min_active_time:
                        users.append(u)
                        recently_active_users.append(u)
                        recent_review = True
                        break
                if recent_review:
                    break
            if not recent_review:
                other_users.append(u)
        else:
            users.append(u)

    if min_active_time:
        recent_positive = sum(new_edge_count[u] for u in recently_active_users)
        recent_examples = len(recently_active_users) * n_businesses
        other_positive = sum(new_edge_count[u] for u in other_users)
        other_examples = len(other_users) * n_businesses
        print "Positives retained from recently active filter:", \
            recent_positive / float(recent_positive + other_positive)
        print "Negatives retained from recently active filter:", \
            (recent_examples - recent_positive) / \
            float(recent_examples - recent_positive + other_examples - other_positive)

    random.seed(0)
    users = random.sample(users, n_users)

    print "Getting candidate set of edges..."
    examples = defaultdict(dict)
    for u in util.logged_loop(users, util.LoopLogger(50, n_users, True)):
        candidate_businesses = snap.TIntV()
        snap.GetNodesAtHop(G, u, 3, candidate_businesses, True)
        for b in candidate_businesses:
            if (u, b) in edges:
                examples[u][b] = 1
            elif random.random() < negative_sample_rate:
                examples[u][b] = 0

    hop3_positives = 0
    for u in examples:
        for b in examples[u]:
            hop3_positives += examples[u][b]
    hop3_examples = sum(len(examples[u]) for u in examples)
    n_positives = sum([new_edge_count[u] for u in users])
    n_examples = len(users) * n_businesses
    print "Positives retained from hop3 filter:", hop3_positives / float(
        n_positives)
    print "Negatives retained from hop3 filter:", (hop3_examples - hop3_positives) / \
            (negative_sample_rate * float(n_examples - n_positives))
    print "Data skew:", hop3_positives / float(hop3_examples)

    print "Writing examples..."
    util.write_json(examples, data_dir + 'examples.json')
Example 9
def reviews_iterator(path='./data/provided/yelp_academic_dataset_review.json'):
    return util.logged_loop(
        util.load_json_lines(path),
        util.LoopLogger(100000, util.lines_in_file(path), True))