Ejemplo n.º 1
0
def copy_db(biz_id, cid):
    '''
    Load a fresh ReviewDB from disk for the requested scope.

    Similar to select_db, except that the whole-dataset case is also loaded
    on the fly here.

    Args:
        biz_id: 'all' for the whole dataset, otherwise an entity id
        cid: accepted but not used by this function
    Returns:
        whatever ReviewDB.load returns for the selected pair of files
    '''
    if biz_id != 'all':
        # Single entity: load that entity's attribute and centroid files.
        return ReviewDB.load(hotel_attr_path(biz_id), hotel_centroids_path(biz_id))
    return ReviewDB.load(all_clusters_path, all_centroids_path)
Ejemplo n.º 2
0
def cluster_topwords():
    '''
    Return the top TF-IDF words for one cluster, or for two clusters.

    Reads these query parameters from the current request:
        biz_id: 'all' uses the preloaded models; any other value loads that
            entity's cluster file and builds a model for it on the fly
        cid1: id of the first cluster
        cid2: optional id of a second cluster
        ngramsize: passed straight to int(), so a missing or non-numeric
            value raises; 2 selects the preloaded bigram model when
            biz_id is 'all'
    Returns:
        a JSON response with status 200 whose body is a list holding one
        top_k result per requested cluster, with the
        Access-Control-Allow-Origin header set to '*'
    '''
    biz_id = request.args.get('biz_id')
    cid1 = request.args.get('cid1')
    cid2 = request.args.get('cid2')
    ngramsize = int(request.args.get('ngramsize'))
    # The 'fixed' request parameter is currently disabled, so the fixed-set
    # comparison branch below is never taken.
    fixed = False

    if biz_id != 'all':
        # Per-entity request: the model is rebuilt from that entity's file.
        db_biz = ReviewDB.load(cluster_file=hotel_attr_path(biz_id))
        cur_tfidf_model = TfidfModel.TFIDFModel(db_biz, ngramsize)
    elif ngramsize == 2:
        cur_tfidf_model = tfidf_model_2g
    else:
        cur_tfidf_model = tfidf_model

    # [Xiong] Revise this part to make it more efficient!
    # Right now it's just a simple and dirty hack.
    if cid2 is None:
        topwords = [cur_tfidf_model.top_k(cid1)]
    elif fixed:
        topwords = cur_tfidf_model.compare_fixed_set(cid1, cid2)
    else:
        topwords = [cur_tfidf_model.top_k(cid1), cur_tfidf_model.top_k(cid2)]

    res = Response(json.dumps(topwords), status=200, mimetype='application/json')
    res.headers.add('Access-Control-Allow-Origin', '*')
    return res
Ejemplo n.º 3
0
    def test_nlplength_funcs(self):
        '''
        Check the word-, sentence- and character-length counters of NLPLengths
        for an empty selection, for id 0 and for cluster '1-2-1-0-0'.
        '''
        # NOTE(review): each result tuple looks like (histogram, mean, median,
        # mode, standard deviation) -- confirm against NLPLengths.
        lengths = NLPLengths(ReviewDB.load(cluster_file='tests/testing_db.csv'))
        cluster_id = '1-2-1-0-0'

        # Empty selection
        no_reviews = lengths.word_token_review_length_counter([])
        self.assertEqual(no_reviews, (Counter(), 0, 0, 0, 0))

        # word_token_review_length_counter
        words_for_zero = lengths.word_token_review_length_counter(0)
        print(words_for_zero)
        expected = (Counter({"12": 1}), 12.0, 12, (12, 1), 0.0)
        self.assertEqual(words_for_zero, expected)
        words_for_cluster = lengths.word_token_review_length_counter(cluster_id)
        print(words_for_cluster)
        expected = (Counter({"22": 1, "7": 1, "6": 1}),
                    11.666666666666666, 7, (6, 1), 8.962886439832502)
        self.assertEqual(words_for_cluster, expected)

        # sent_token_review_length_counter
        sents_for_zero = lengths.sent_token_review_length_counter(0)
        print(sents_for_zero)
        expected = (Counter({"1": 1}), 1.0, 1, (1, 1), 0.0)
        self.assertEqual(sents_for_zero, expected)
        sents_for_cluster = lengths.sent_token_review_length_counter(cluster_id)
        print(sents_for_cluster)
        expected = (Counter({"1": 2, '3': 1}),
                    1.6666666666666667, 1, (1, 2), 1.1547005383792515)
        self.assertEqual(sents_for_cluster, expected)

        # char_review_length_counter
        chars_for_zero = lengths.char_review_length_counter(0)
        print(chars_for_zero)
        expected = (Counter({"53": 1}), 53.0, 53, (53, 1), 0.0)
        self.assertEqual(chars_for_zero, expected)
        chars_for_cluster = lengths.char_review_length_counter(cluster_id)
        print(chars_for_cluster)
        expected = (Counter({"101": 1, "31": 1, "30": 1}),
                    54.0, 31, (30, 1), 40.70626487409524)
        self.assertEqual(chars_for_cluster, expected)

        # A Counter answers 0 for a key it has never seen.
        sentence_histogram = sents_for_cluster[0]
        self.assertEqual(sentence_histogram['0'], 0)
Ejemplo n.º 4
0
 def test_density_estimator(self):
     '''
     The density estimate built from a cluster's character-length histogram
     should sum to 1.
     '''
     db = ReviewDB.load(cluster_file='tests/testing_db.csv')
     nlp = NLPLengths(db)
     histogram_comparison = HistogramComparison()
     # Element 0 of the counter result is the length histogram.
     histogram = nlp.char_review_length_counter('1-2-1-0-0')[0]
     density_estimate = histogram_comparison.density_estimator(histogram)
     # A sum of floats need not land on exactly 1.0, so compare with a
     # tolerance rather than exact equality.
     self.assertAlmostEqual(sum(density_estimate.values()), 1.0)
Ejemplo n.º 5
0
 def test_density_estimator(self):
     '''
     The density estimate built from a cluster's character-length histogram
     should sum to 1.
     '''
     db = ReviewDB('tests/test_data/')
     nlp = NLPLengths(db.entity_db_dict['all'])
     histogram_comparison = HistogramComparison()
     # Element 0 of the counter result is the length histogram.
     histogram = nlp.char_review_length_counter('1-2-1-0-0')[0]
     density_estimate = histogram_comparison.density_estimator(histogram)
     # A sum of floats need not land on exactly 1.0, so compare with a
     # tolerance rather than exact equality.
     self.assertAlmostEqual(sum(density_estimate.values()), 1.0)
Ejemplo n.º 6
0
 def test_ReviewDB_init(self):
     '''
         Test initialization of review ids equivalent to original indices as part of
         init for ReviewDB object.
     '''
     db = ReviewDB('tests/test_data/')
     # Look the frame up once instead of on every loop iteration.
     clusters = db.entity_db_dict['all'].clusters_df
     for i in range(10):
         # The row whose review_id is i must sit at index label i; .item()
         # raises unless the selection holds exactly one row.
         self.assertEqual(clusters[clusters['review_id'] == i].index.item(), i)
Ejemplo n.º 7
0
def select_reviews_by_layer(biz_id, cid):
    '''
    Fetch the reviews of one cluster, for a single entity or for all entities.
    Args:
        biz_id: 'all', or the id of one entity (e.g. hotel)
        cid: id of the cluster to look up
    Returns:
        the result of get_review_from_id(cid) on the selected database
    '''
    if biz_id == 'all':
        # The whole-dataset database is already loaded at module level.
        review_db = db_all
    else:
        # A single entity is loaded from its own files on demand.
        review_db = ReviewDB.load(hotel_attr_path(biz_id), hotel_centroids_path(biz_id))
    return review_db.get_review_from_id(cid)
Ejemplo n.º 8
0
def select_db(biz_id):
    '''
    Return the review database object that matches biz_id.

    The whole dataset is served from the already-loaded db_all, because
    loading it on the fly can take a lot of time. Loading a single entity is
    fairly fast, so that case is read from disk on every call.
    Args:
        biz_id: 'all' or an actual entity id
    Returns:
        db_all for 'all', otherwise a newly loaded database for that entity
    '''
    if biz_id != 'all':
        return ReviewDB.load(hotel_attr_path(biz_id), hotel_centroids_path(biz_id))
    return db_all
Ejemplo n.º 9
0
 def test_TFIDF_funcs(self):
     '''
     Exercise tfidf_score, top_k and compare_top_k of TFIDFModel on the toy
     data set, using id 0 and cluster id '1-2-1-0-0'.
     '''
     db = ReviewDB('tests/test_data/')
     tfidf = TFIDFModel(db.entity_db_dict['all'])

     #test tfidf.tfidf_score(), which also calls tfidf.scores_to_counter()
     tfidf_zero = tfidf.tfidf_score(0, ['wharf'])
     self.assertIn('wharf', tfidf_zero.keys())
     self.assertNotIn('banana', tfidf_zero.keys())
     tfidf_cluster = tfidf.tfidf_score(
         '1-2-1-0-0', ['towels', 'unwelcome', 'charge', 'wharf'])
     self.assertGreater(tfidf_cluster['towels'], 0)
     self.assertGreater(tfidf_cluster['unwelcome'], 0)
     self.assertGreater(tfidf_cluster['charge'], 0)
     self.assertEqual(tfidf_cluster['wharf'], 0.0)

     #test tfidf.top_k(), which also calls tfidf.scores_to_counter()
     top_for_zero = tfidf.top_k(0)
     self.assertIn('wharf', top_for_zero.keys())
     top_for_cluster = tfidf.top_k('1-2-1-0-0')
     self.assertIn('towels', top_for_cluster.keys())
     self.assertIn('charge', top_for_cluster.keys())
     self.assertNotIn('wharf', top_for_cluster.keys())

     #test tfidf.compare_top_k()
     group1, group2 = tfidf.compare_top_k(0, '1-2-1-0-0')
     # Every top word of either side must appear in both groups, and each
     # group must carry its own side's top_k score for that word. Asserting
     # directly (instead of collecting flags) reports the failing key.
     for key in top_for_cluster.keys():
         self.assertIn(key, group1.keys())
         self.assertIn(key, group2.keys())
         self.assertEqual(group2[key], top_for_cluster[key])
     for key in top_for_zero.keys():
         self.assertIn(key, group2.keys())
         self.assertIn(key, group1.keys())
         self.assertEqual(group1[key], top_for_zero[key])
     # 'wharf' is absent from the cluster's own top words, so its score in
     # the cluster's group is zero.
     self.assertEqual(group2['wharf'], 0.0)
Ejemplo n.º 10
0
 def test_euclidean(self):
     '''
     Check HistogramComparison.euclidean on identical histograms, on a
     one-bin pair and on two histograms that overlap in a single bin.
     '''
     db = ReviewDB.load(cluster_file='tests/testing_db.csv')
     nlp = NLPLengths(db)
     histogram_comparison = HistogramComparison()
     # A histogram compared with itself is at distance zero.
     histogram1 = nlp.char_review_length_counter('1-2-1-0-0')[0]
     compare_self = histogram_comparison.euclidean(histogram1, histogram1)
     self.assertEqual(compare_self, 0.0)
     trivial_histogram1 = Counter({"1": 1})
     trivial_histogram2 = Counter({"1": 0})
     compare_trivial = histogram_comparison.euclidean(
         trivial_histogram1, trivial_histogram2)
     self.assertEqual(compare_trivial, 1.0)
     more_complicated_histogram1 = Counter({"1": 1, "2": 2})
     more_complicated_histogram2 = Counter({"2": 3, "3": 4})
     compare_more_complicated = histogram_comparison.euclidean(
         more_complicated_histogram1, more_complicated_histogram2)
     # Two-sided tolerance: the previous "value - expected < .001" check also
     # accepted any result that was far too small. Expected value is
     # sqrt(1 + 1 + 16) for the bin differences 1, -1 and -4.
     self.assertAlmostEqual(compare_more_complicated, 4.24264, delta=.001)
Ejemplo n.º 11
0
 def test_sorensen(self):
     '''
     Check HistogramComparison.sorensen on identical histograms, on a
     one-bin pair and on two histograms that overlap in a single bin.
     '''
     review_db = ReviewDB('tests/test_data/')
     lengths = NLPLengths(review_db.entity_db_dict['all'])
     comparer = HistogramComparison()

     # A histogram compared with itself is at distance zero.
     cluster_hist = lengths.char_review_length_counter('1-2-1-0-0')[0]
     self.assertEqual(comparer.sorensen(cluster_hist, cluster_hist), 0.0)

     # One count in a bin versus zero counts in the same bin.
     single = Counter({"1": 1})
     empty = Counter({"1": 0})
     self.assertEqual(comparer.sorensen(single, empty), 1.0)

     # Histograms that share only the "2" bin.
     left = Counter({"1": 1, "2": 2})
     right = Counter({"2": 3, "3": 4})
     distance = comparer.sorensen(left, right)
     # NOTE(review): this bound is one-sided (no abs()), so any value below
     # 0.66767 passes -- confirm the expected value before tightening it.
     self.assertLess(distance - 0.66667, .001)
Ejemplo n.º 12
0
 def test_hellinger(self):
     '''
     Check HistogramComparison.hellinger on identical histograms, on a
     one-bin pair and on two histograms that overlap in a single bin.
     '''
     db = ReviewDB('tests/test_data/')
     nlp = NLPLengths(db.entity_db_dict['all'])
     histogram_comparison = HistogramComparison()
     # A histogram compared with itself is at distance zero.
     histogram1 = nlp.char_review_length_counter('1-2-1-0-0')[0]
     compare_self = histogram_comparison.hellinger(histogram1, histogram1)
     self.assertEqual(compare_self, 0.0)
     trivial_histogram1 = Counter({"1": 1})
     trivial_histogram2 = Counter({"1": 0})
     compare_trivial = histogram_comparison.hellinger(
         trivial_histogram1, trivial_histogram2)
     # 1/sqrt(2); compared with a tolerance because the last digit depends
     # on how the square root is evaluated.
     self.assertAlmostEqual(compare_trivial, 0.7071067811865475)
     more_complicated_histogram1 = Counter({"1": 1, "2": 2})
     more_complicated_histogram2 = Counter({"2": 3, "3": 4})
     compare_more_complicated = histogram_comparison.hellinger(
         more_complicated_histogram1, more_complicated_histogram2)
     # Two-sided tolerance: the previous "value - expected < .001" check also
     # accepted any result that was far too small.
     self.assertAlmostEqual(
         compare_more_complicated, 0.6822591268536838, delta=.001)
Ejemplo n.º 13
0
from flask import send_from_directory
from flask_cors import CORS
from libs import nlp_length_functions
from libs.histogram_comparisons import HistogramComparison
from libs.review_db import ReviewDB

# logging configurations
logging.basicConfig(format='%(filename)s:%(lineno)d %(message)s')
log = logging.getLogger(__name__)
log.setLevel('INFO')

# set up data access
# Context managers close the JSON files as soon as they are parsed instead of
# leaving the open handles to the garbage collector.
with open("./../config.json") as _config_file:
    CONFIG = json.load(_config_file)
# The dataset directory is DATA_DIR/<CONFIG['dataset']>; a missing DATA_DIR
# environment variable raises KeyError at import time.
data_folder = os.path.join(os.environ['DATA_DIR'], CONFIG['dataset'])
with open(os.path.join(data_folder, 'schema.json')) as _schema_file:
    schema = json.load(_schema_file)['schema']
database = ReviewDB(data_folder)

# The app serves the built React front end from ./react-app/build.
app = Flask(__name__,
            static_folder='./react-app/build/static',
            template_folder='./react-app/build')
cors = CORS(app)

app.config['SEND_FILE_MAX_AGE_DEFAULT'] = 0
app.config['TEMPLATES_AUTO_RELOAD'] = True

histogram_comparison_utils = HistogramComparison()


# [Xiong] endpoint for sending static files
@app.route('/data/<path:subpath>')
def data(subpath):
Ejemplo n.º 14
0
 def test_tfidf_bigram(self):
     '''
     A model built with ngramsize=2 should score a two-word term.
     '''
     db = ReviewDB('tests/test_data/')
     tfidf = TFIDFModel(db.entity_db_dict['all'], ngramsize=2)
     tfidf_zero = tfidf.tfidf_score(0, ["wharf rooms"])
     # assertIn reports the missing key and the available keys on failure.
     self.assertIn("wharf rooms", tfidf_zero.keys())
Ejemplo n.º 15
0
 def test_nlplength_init(self):
     '''
     Constructing NLPLengths from a loaded database should succeed.
     '''
     db = ReviewDB.load(cluster_file='tests/testing_db.csv')
     nlp = NLPLengths(db)
     self.assertIsNotNone(nlp)
Ejemplo n.º 16
0
 def test_tfidf_bigram(self):
     '''
     A model built with ngramsize=2 should score a two-word term.
     '''
     db = ReviewDB.load(cluster_file='tests/testing_db.csv')
     tfidf = TFIDFModel(db, ngramsize=2)
     tfidf_zero = tfidf.tfidf_score(0, ["wharf rooms"])
     # assertIn reports the missing key and the available keys on failure.
     self.assertIn("wharf rooms", tfidf_zero.keys())
Ejemplo n.º 17
0
 def test_TFIDF_init(self):
     '''
     Constructing TFIDFModel from a loaded database should succeed.
     '''
     db = ReviewDB.load(cluster_file='tests/testing_db.csv')
     tfidf = TFIDFModel(db)
     self.assertIsNotNone(tfidf)
Ejemplo n.º 18
0
 def test_nlplength_init(self):
     '''
     Constructing NLPLengths from the 'all' entity database should succeed.
     '''
     db = ReviewDB('tests/test_data/')
     nlp = NLPLengths(db.entity_db_dict['all'])
     self.assertIsNotNone(nlp)
Ejemplo n.º 19
0
 def test_ReviewDB_funcs(self):
     '''
         Run decode_id, fetch_reviews and get_review_from_id of the 'all'
         entity database against a toy data set.
     '''
     all_db = ReviewDB('tests/test_data/').entity_db_dict['all']

     # Empty input: every accessor answers None.
     empty_results = [
         all_db.decode_id([]),
         all_db.fetch_reviews([]),
         all_db.get_review_from_id([]),
     ]
     for outcome in empty_results:
         self.assertEqual(outcome, None)

     # decode_id(): cluster id -> list of review ids
     decode_cases = [
         (0, [0]),
         ('1-2-1-0-0', [1, 2, 7]),
         ('1-2-1-0', [1, 2, 7]),
         ('1-2-1', [1, 2, 7]),
         ('1-2', [1, 2, 7]),
         ('4', [0, 5, 6, 8]),
     ]
     for cluster_id, review_ids in decode_cases:
         self.assertEqual(all_db.decode_id(cluster_id), review_ids)

     # fetch_reviews(): review ids -> rows, checked by author in row order
     fetch_cases = [
         ([0], ['guest1']),
         ([0, 3, 7], ['guest1', 'guest4', 'guest8']),
     ]
     for review_ids, authors in fetch_cases:
         rows = all_db.fetch_reviews(review_ids)
         for position, author in enumerate(authors):
             self.assertEqual(rows.iloc[[position]].author.values, author)

     # get_review_from_id(): cluster id -> rows, checked by author in row order
     branch_authors = ['guest2', 'guest3', 'guest8']
     review_cases = [
         (0, ['guest1']),
         ('1-2-1-0-0', branch_authors),
         ('1-2-1-0', branch_authors),
         ('1-2-1', branch_authors),
         ('1-2', branch_authors),
         ('1', branch_authors),
         ('4', ['guest1', 'guest6', 'guest7', 'guest9']),
     ]
     for cluster_id, authors in review_cases:
         rows = all_db.get_review_from_id(cluster_id)
         for position, author in enumerate(authors):
             self.assertEqual(rows.iloc[[position]].author.values, author)

     # The special id "all" decodes to every review id.
     everything = all_db.decode_id("all")
     self.assertEqual(everything, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
Ejemplo n.º 20
0
 def test_TFIDF_init(self):
     '''
     Constructing TFIDFModel from the 'all' entity database should succeed.
     '''
     db = ReviewDB('tests/test_data/')
     tfidf = TFIDFModel(db.entity_db_dict['all'])
     self.assertIsNotNone(tfidf)
Ejemplo n.º 21
0
log = logging.getLogger(__name__)
log.setLevel('INFO')

# Context managers close the JSON files as soon as they are parsed instead of
# leaving the open handles to the garbage collector.
with open("./../config.json") as _config_file:
    CONFIG = json.load(_config_file)
# The dataset directory is DATA_DIR/<CONFIG['dataset']>; a missing DATA_DIR
# environment variable raises KeyError at import time.
data_folder = os.path.join(os.environ['DATA_DIR'], CONFIG['dataset'])
with open(os.path.join(data_folder, 'schema.json')) as _schema_file:
    schema = json.load(_schema_file)['schema']

app = Flask(__name__, static_folder='./react-app/build/static', template_folder='./react-app/build')
app.config['SEND_FILE_MAX_AGE_DEFAULT'] = 0
app.config['TEMPLATES_AUTO_RELOAD'] = True

# The whole-dataset frames and the models built on them are loaded once, at
# import time.
all_centroids_df = pd.read_csv(os.path.join(data_folder, 'centroids.csv'))
log.info('centroids loaded')
all_clusters_df = pd.read_csv(os.path.join(data_folder, 'clusters.csv'))
log.info('clusters loaded')
db_all = ReviewDB(all_clusters_df, all_centroids_df)
working_df = None

tfidf_model = TfidfModel.TFIDFModel(db_all)
tfidf_model_2g = TfidfModel.TFIDFModel(db_all, 2)


def hotel_attr_path(biz_id):
    '''Return the path of the attr.csv file for entity biz_id.'''
    return os.path.join(data_folder, f'hotel-clusters/{biz_id}/attr.csv')


def hotel_centroids_path(biz_id):
    '''Return the path of the centroids.csv file for entity biz_id.'''
    return os.path.join(data_folder, f'hotel-clusters/{biz_id}/centroids.csv')


histogram_comparison_utils = HistogramComparison()

# [Xiong] Set up CORS access. I do this because I test the frontend on
# localhost:3000 while the server runs on localhost:5000. Eventually the CORS
# setup will allow the data server and the front-end hosting server to run on
# different machines --- which may not be necessary, though.
@app.after_request