def seed_sentcats():
    """For sentences in Sentences, categorize using the multilabel classifier
    and add the results to SentCats -- initial seeding version.

    NOTE(review): this function is defined twice in this file; the later
    definition shadows this one at import time -- confirm which copy is
    canonical and delete the other.
    """
    # select all sentences from the Sentences table
    results = db.session.query(Sentence).offset(0).all()

    # for each sentence, categorize with the classifier
    for sentence in results:
        sent_id = sentence.sent_id
        text = sentence.sent_text
        predictions = categorize_text(text)

        for cat in predictions:
            # look up any existing row for this (sentence, category) pair.
            # BUG(fixed): the original filtered on sent_id only, so a
            # multilabel sentence could never store more than one category --
            # the second predicted cat always found (and re-added) the row
            # created for the first.
            sentcat = SentenceCategory.query.filter(
                SentenceCategory.sent_id == sent_id,
                SentenceCategory.cat_code == cat).first()

            if cat == 'gltn':
                # 'gltn' sentences also get a sentiment score
                # (predict_sentiment revives its model components internally)
                sentiment_score = predict_sentiment([text])
                # prediction_list[0][2] is the decision_function score
                sen_score = sentiment_score[0][2]
                if not sentcat:
                    sentcat = SentenceCategory(sent_id=sent_id,
                                               cat_code='gltn',
                                               sen_score=sen_score)
                else:
                    sentcat.sen_score = sen_score
            else:
                if not sentcat:
                    # TODO: will have to perform sentiment analysis and
                    # update later
                    sentcat = SentenceCategory(sent_id=sent_id, cat_code=cat)

            db.session.add(sentcat)
            db.session.commit()
    return
def seed_sentcats():
    """Categorize every sentence in Sentences with the multilabel classifier
    and store the results in SentCats (initial seeding version).

    NOTE(review): duplicate definition -- an identical seed_sentcats appears
    earlier in this file; this copy shadows it. Keep one and delete the other.
    """
    # pull every sentence from the Sentences table
    results = db.session.query(Sentence).offset(0).all()

    for sentence in results:
        sent_id = sentence.sent_id
        text = sentence.sent_text

        # classifier returns the predicted category codes for this sentence
        predictions = categorize_text(text)
        for cat in predictions:
            # BUG(fixed): filter on cat_code as well as sent_id -- filtering
            # on sent_id alone meant a sentence could only ever hold a single
            # category row, defeating the multilabel classifier.
            sentcat = SentenceCategory.query.filter(
                SentenceCategory.sent_id == sent_id,
                SentenceCategory.cat_code == cat).first()

            if cat == 'gltn':
                # 'gltn' additionally gets a sentiment score; index [0][2] is
                # the decision_function score of the first prediction
                sentiment_score = predict_sentiment([text])
                sen_score = sentiment_score[0][2]
                if sentcat is None:
                    sentcat = SentenceCategory(sent_id=sent_id,
                                               cat_code='gltn',
                                               sen_score=sen_score)
                else:
                    sentcat.sen_score = sen_score
            elif sentcat is None:
                # TODO: perform sentiment analysis for this row later
                sentcat = SentenceCategory(sent_id=sent_id, cat_code=cat)

            db.session.add(sentcat)
            db.session.commit()
    return
def update_sentcat_score(cat_code, search_term):
    """Replace hand-built sentiment scores with text-processing API scores.

    For each selected sentence, POST its text to the text-processing.com
    sentiment endpoint and store the positive-class probability as sen_score.

    :param cat_code: category code used when inserting a new SentCats row
    :param search_term: keyword the caller intends to score (see NOTE below)
    """
    # checking progress of update_sentcat_score('vgan', 'vegan')
    # sqlite> select sentences.sent_text, sentcats.sentcat_id, sentcats.sen_score from sentences
    # ...> left join sentcats on sentcats.sent_id = sentences.sent_id
    # ...> where sentcats.cat_code = 'vgan'
    # ...> limit 10;
    url = "http://text-processing.com/api/sentiment/"
    # categories whose sen_score was already refreshed; never overwrite these
    updated_cat_codes = ['gltn', 'algy']

    # BUG(fixed): the original first queried all sentences LIKE %search_term%
    # and then immediately overwrote that result with the query below, so the
    # first query was dead code; it has been removed.
    # NOTE(review): this filter hard-codes 'gluten'/'celiac' instead of using
    # search_term, and (not A) | (not B) matches rows missing *either* term
    # (i.e. everything not containing both) -- confirm the intended set.
    sentences = SentenceCategory.query.outerjoin(Sentence).filter(
        (not_(Sentence.sent_text.like('%gluten%'))) |
        (not_(Sentence.sent_text.like('%celiac%')))).all()

    for sentence in sentences:
        # NOTE(review): rows here are SentenceCategory objects; confirm the
        # model exposes sent_text (e.g. via a relationship proxy).
        doc = sentence.sent_text
        payload = {'text': doc}

        # query the text-processing API for a sentiment score
        r = requests.post(url, data=payload)
        result = json.loads(r.text)
        # positive-class probability from the API response
        sen_score = result['probability']['pos']

        # BUG(fixed): .one() raises NoResultFound when no row exists, which
        # made the insert branch below unreachable; .first() returns None
        # instead and lets the insert run.
        existing = SentenceCategory.query.filter(
            SentenceCategory.sent_id == sentence.sent_id).first()
        if existing:
            # don't update already-refreshed categories again
            if existing.cat_code not in updated_cat_codes:
                existing.sen_score = sen_score
        else:
            # add sentence to sentcats
            sentcat = SentenceCategory(sent_id=sentence.sent_id,
                                       cat_code=cat_code,
                                       sen_score=sen_score)
            # BUG(fixed): the new row was built but never staged, so it was
            # silently dropped at commit time
            db.session.add(sentcat)
            # sentence.sen_score = 0
        db.session.commit()

        # throttle API calls: pause a random 0-10 seconds between requests
        # (the original comment said "5 seconds" but the code sleeps 0-10)
        time.sleep(random.randint(0, 10))
    return
def seed_keyword_revcat(search_term, cat_code):
    """Add more revcats by searching review text with LIKE '%<search_term>%'.

    Tested on all reviews containing 'vegan' where reviews.biz_id=148, then
    reran for all reviews containing 'vegan' where reviews.biz_id != 148.

    For each matching review: ensure a ReviewCategory row exists, ensure its
    sentences are tokenized into the Sentences table, and ensure sentences
    containing search_term have a scored SentenceCategory row.

    NOTE(review): this function is defined twice in this file; the later
    definition shadows this one -- keep one copy.
    """
    # sqlite> select reviews.review_id, revcats.revcat_id, sentences.sent_id from reviews
    # ...> LEFT JOIN revcats ON revcats.review_id = reviews.review_id
    # ...> LEFT JOIN sentences on sentences.review_id = reviews.review_id
    # ...> WHERE reviews.biz_id = 148 and reviews.text like '%vegan%';
    # sqlite> select count(*) from reviews where reviews.biz_id != 148 and reviews.text like '%vegan%';
    # count(*)
    # 3946

    # query db for all reviews containing the search term
    reviews = db.session.query(PlatePalReview, ReviewCategory, Sentence,
                               SentenceCategory)
    reviews_joined = reviews.outerjoin(ReviewCategory).outerjoin(
        Sentence).outerjoin(SentenceCategory)
    keyword_reviews = reviews_joined.filter(
        PlatePalReview.text.like('%' + search_term + '%'))
    # vegan_reviews = reviews_joined.filter(PlatePalReview.biz_id!=148, PlatePalReview.review_id!=7617, PlatePalReview.text.like(('%'+search_term+'%')))

    # instantiate preprocessor for splitting text into sentences
    preprocessor = PennTreebankPunkt(use_flag="sentences")

    for review, revcats, sentences, sentcats in keyword_reviews:
        # ---- review -> revcats ------------------------------------------
        if not revcats:
            # get sentiment score of the whole review, then record it
            sen_score = get_sentiment(review.text)
            revcat = ReviewCategory(review_id=review.review_id,
                                    biz_id=review.biz_id,
                                    cat_code=cat_code,
                                    sen_score=sen_score)
            db.session.add(revcat)
            db.session.commit()

        # ---- review -> sentences / sentcats -----------------------------
        if not sentences:
            # tokenize into sentences and add them to the Sentences table
            sentence_list = preprocessor(review.text)
            for sentence in sentence_list:
                sent = Sentence(review_id=review.review_id,
                                sent_text=sentence)
                db.session.add(sent)
                db.session.commit()

                # add sentences containing search_term to sentcats
                if search_term in sentence:
                    sent_ids = db.session.query(Sentence.sent_id).filter(
                        Sentence.sent_text == sentence,
                        Sentence.review_id == review.review_id).all()
                    for sid in sent_ids:
                        sen_score = get_sentiment(sentence)
                        sentcat = SentenceCategory(sent_id=sid[0],
                                                   cat_code=cat_code,
                                                   sen_score=sen_score)
                        db.session.add(sentcat)
                        db.session.commit()
        elif not sentcats:
            # there are sentences, so check if sentences containing
            # search_term have sentcats.
            # BUG(fixed): original tested isinstance(type(sentences), list),
            # which is always False; test the object itself.
            if isinstance(sentences, list):
                for sentence in sentences:
                    # BUG(fixed): original read sentence.text (model attribute
                    # is sent_text) and compared Sentence.sent_text against
                    # the ORM object instead of its text
                    if search_term in sentence.sent_text:
                        sent_ids = db.session.query(Sentence.sent_id).filter(
                            Sentence.sent_text == sentence.sent_text,
                            Sentence.review_id == review.review_id).all()
                        for sid in sent_ids:
                            sen_score = get_sentiment(sentence.sent_text)
                            sentcat = SentenceCategory(sent_id=sid[0],
                                                       cat_code=cat_code,
                                                       sen_score=sen_score)
                            db.session.add(sentcat)
                            db.session.commit()
            else:
                # single sentence in sentences
                sentence = sentences
                if search_term in sentence.sent_text:
                    sen_score = get_sentiment(sentence.sent_text)
                    sentcat = SentenceCategory(sent_id=sentence.sent_id,
                                               cat_code=cat_code,
                                               sen_score=sen_score)
                    db.session.add(sentcat)
                    db.session.commit()
        else:
            # there are sentcats ... make sure cat_code matches
            # sentcat.cat_code, then backfill any missing/zero sentiment score
            if isinstance(sentcats, list):
                for sentcat in sentcats:
                    if sentcat.cat_code != cat_code:
                        continue
                    # BUG(fixed): a single-column .one() returns a 1-tuple;
                    # unpack it before substring tests and scoring
                    sentence_text = db.session.query(
                        Sentence.sent_text).filter(
                        Sentence.sent_id == sentcat.sent_id).one()[0]
                    if search_term in sentence_text:
                        # None and 0 both mean "no real score yet"
                        if not sentcat.sen_score:
                            sentcat.sen_score = get_sentiment(sentence_text)
                            db.session.add(sentcat)
                            db.session.commit()
                        else:
                            # there is a non-zero sentiment score
                            print("sentiment score exists for sentcat %d"
                                  % sentcat.sentcat_id)
            else:
                # single sentcat
                sentcat = sentcats
                if sentcat.cat_code == cat_code:
                    sentence_text = db.session.query(
                        Sentence.sent_text).filter(
                        Sentence.sent_id == sentcat.sent_id).one()[0]
                    if search_term in sentence_text:
                        if not sentcat.sen_score:
                            # BUG(fixed): original scored a stale `sentence`
                            # variable here; score this sentcat's own text
                            sentcat.sen_score = get_sentiment(sentence_text)
                            db.session.add(sentcat)
                            db.session.commit()
                        else:
                            # there is a non-zero sentiment score
                            print("sentiment score exists for sentcat %d"
                                  % sentcat.sentcat_id)
    return
def seed_keyword_revcat(search_term, cat_code):
    """Add more revcats by using like '%vegan%'.

    Tested on all reviews containing 'vegan' where reviews.biz_id=148, then
    reran for all reviews containing 'vegan' where reviews.biz_id != 148.

    NOTE(review): duplicate definition -- an identical seed_keyword_revcat
    appears earlier in this file; this copy shadows it at import time.
    """
    # sqlite> select reviews.review_id, revcats.revcat_id, sentences.sent_id from reviews
    # ...> LEFT JOIN revcats ON revcats.review_id = reviews.review_id
    # ...> LEFT JOIN sentences on sentences.review_id = reviews.review_id
    # ...> WHERE reviews.biz_id = 148 and reviews.text like '%vegan%';
    # sqlite> select count(*) from reviews where reviews.biz_id != 148 and reviews.text like '%vegan%';
    # count(*)
    # 3946

    # reviews joined to their categories, sentences and sentence categories
    base = db.session.query(PlatePalReview, ReviewCategory, Sentence,
                            SentenceCategory)
    joined = base.outerjoin(ReviewCategory).outerjoin(
        Sentence).outerjoin(SentenceCategory)
    keyword_reviews = joined.filter(
        PlatePalReview.text.like('%' + search_term + '%'))
    # vegan_reviews = reviews_joined.filter(PlatePalReview.biz_id!=148, PlatePalReview.review_id!=7617, PlatePalReview.text.like(('%'+search_term+'%')))

    # splits a review's text into sentences
    preprocessor = PennTreebankPunkt(use_flag="sentences")

    for group in keyword_reviews:
        review, revcats, sentences, sentcats = group

        # missing revcat: score the whole review and record it
        if not revcats:
            revcat = ReviewCategory(review_id=review.review_id,
                                    biz_id=review.biz_id,
                                    cat_code=cat_code,
                                    sen_score=get_sentiment(review.text))
            db.session.add(revcat)
            db.session.commit()

        if not sentences:
            # tokenize review text and seed the Sentences table
            for sentence in preprocessor(review.text):
                sent = Sentence(review_id=review.review_id,
                                sent_text=sentence)
                db.session.add(sent)
                db.session.commit()

                # sentences containing search_term also get a sentcat row
                if search_term in sentence:
                    sent_ids = db.session.query(Sentence.sent_id).filter(
                        Sentence.sent_text == sentence,
                        Sentence.review_id == review.review_id).all()
                    for sid in sent_ids:
                        sentcat = SentenceCategory(
                            sent_id=sid[0],
                            cat_code=cat_code,
                            sen_score=get_sentiment(sentence))
                        db.session.add(sentcat)
                        db.session.commit()
        elif not sentcats:
            # sentences exist but have no sentcats yet.
            # BUG(fixed): isinstance(type(x), list) is always False; test the
            # object itself to distinguish a list from a single Sentence.
            if isinstance(sentences, list):
                for sentence in sentences:
                    # BUG(fixed): use sent_text (not .text) and compare the
                    # column against the sentence's text, not the ORM object
                    if search_term not in sentence.sent_text:
                        continue
                    sent_ids = db.session.query(Sentence.sent_id).filter(
                        Sentence.sent_text == sentence.sent_text,
                        Sentence.review_id == review.review_id).all()
                    for sid in sent_ids:
                        sentcat = SentenceCategory(
                            sent_id=sid[0],
                            cat_code=cat_code,
                            sen_score=get_sentiment(sentence.sent_text))
                        db.session.add(sentcat)
                        db.session.commit()
            else:
                # single sentence
                sentence = sentences
                if search_term in sentence.sent_text:
                    sentcat = SentenceCategory(
                        sent_id=sentence.sent_id,
                        cat_code=cat_code,
                        sen_score=get_sentiment(sentence.sent_text))
                    db.session.add(sentcat)
                    db.session.commit()
        else:
            # sentcats exist ... make sure cat_code matches sentcat.cat_code,
            # then fill in a sentiment score where it is missing or zero
            matching = sentcats if isinstance(sentcats, list) else [sentcats]
            for sentcat in matching:
                if sentcat.cat_code != cat_code:
                    continue
                # BUG(fixed): single-column .one() yields a 1-tuple; unpack
                # before the substring test and scoring (the original also
                # scored a stale `sentence` variable in the single-row case)
                sentence_text = db.session.query(Sentence.sent_text).filter(
                    Sentence.sent_id == sentcat.sent_id).one()[0]
                if search_term in sentence_text:
                    # None and 0 both mean "no real score yet"
                    if not sentcat.sen_score:
                        sentcat.sen_score = get_sentiment(sentence_text)
                        db.session.add(sentcat)
                        db.session.commit()
                    else:
                        # a non-zero sentiment score already exists
                        print("sentiment score exists for sentcat %d"
                              % sentcat.sentcat_id)
    return