forked from hughec/yelp_social_sentiment
/
reviewcrf.py
75 lines (67 loc) · 3.91 KB
/
reviewcrf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#!/usr/bin/env python
import pycrfsuite
import readyelp
import cleanyelp
import random
import numpy
def train_crf(train_reviews, user_dict):
""" Given a dictionary of training reviews and a dictionary of user objects, trains a Linear-chain CRF model to tag pairs of reviews as either members of the same class (["1", "1"]) or different classes (["1", "0"]). Features for the CRF consist of the date and text of each review. """
trainer = pycrfsuite.Trainer('lbfgs')
for train_id in train_reviews:
train_review = train_reviews[train_id]
friend_reviews_of_business = train_review["friend_reviews_of_business"]
for friend_review_id in friend_reviews_of_business:
if friend_review_id not in train_reviews: continue
friend_review = train_reviews[friend_review_id]
if friend_review["date"] <= train_review["date"]:
friend = user_dict[friend_review["user_id"]]
user = user_dict[train_review["user_id"]]
friend_item = {"id":friend_review["user_id"], "friends":len(friend["friends"]), "text":friend_review["text"]}
train_item = {"id":train_review["user_id"], "friends":len(user["friends"]), "text":train_review["text"]}
xseq = [friend_item, train_item]
# If the ratings are the same between reviews, both are labeled as "1" in the Y sequence, otherwise the training review is labeled as "0".
# This allows for the training of a CRF that determines the probability that the latter review receives the same label as the former - the strength of the link between reviews.
if friend_review["rating"] == train_review["rating"]:
yseq = ["1", "1"]
else:
yseq = ["1", "0"]
trainer.append(xseq, yseq)
trainer.train("reviewcrfmodel")
def crftag_probabilities(test_reviews, user_dict):
""" Given a dictionary of reviews for test, tags pairs of reviews of the same business created by two users who are friends. Returns a dictionary mapping tuples (r1, r2) to the probability that those two reviews should receive the same classification. """
probabilities = {}
tagger = pycrfsuite.Tagger()
tagger.open("reviewcrfmodel")
num_same = 0
num_diff = 0
for test_id in test_reviews:
test_review = test_reviews[test_id]
friend_reviews_of_business = test_review["friend_reviews_of_business"]
for friend_review_id in friend_reviews_of_business:
if friend_review_id not in test_reviews: continue
friend_review = test_reviews[friend_review_id]
if friend_review["date"] <= test_review["date"]:
friend = user_dict[friend_review["user_id"]]
user = user_dict[test_review["user_id"]]
friend_item = {"id":friend_review["user_id"], "friends":len(friend["friends"]), "text":friend_review["text"]}
test_item = {"id":test_review["user_id"], "friends":len(user["friends"]), "text":test_review["text"]}
xseq = [friend_item, test_item]
yseq = tagger.tag(xseq)
# The probability of the labeling yseq, given the input xseq
prob_y = tagger.probability(yseq)
if yseq[1] == "1":
probabilities[(friend_review_id, test_id)] = prob_y
num_same += 1
else:
probabilities[(friend_review_id, test_id)] = 1 - prob_y
num_diff += 1
print "Number of same, differnt from CRF:", num_same, num_diff
return probabilities
def main():
train_reviews = readyelp.read_reviews_to_dict("train_reviews.json")
test_reviews = readyelp.read_reviews_to_dict("test_reviews.json")
user_dict = readyelp.read_users_to_dict("users_limited.json")
train_crf(train_reviews, user_dict)
crftag(test_reviews, user_dict)
if __name__ == "__main__":
main()