Example #1
0
def seed_sentcats():
    """
    For sentences in Sentences, categorize using multilabel classifier
    Add results to SentCats -- Initial seeding version

    Every sentence is run through categorize_text; for each predicted
    category a SentenceCategory row is created unless one already exists for
    that (sentence, category) pair. Only the 'gltn' category is scored: its
    sen_score is set (or refreshed on an existing row) from predict_sentiment.
    Changes are committed once per sentence.
    """
    # select all sentences from Sentences table
    results = db.session.query(Sentence).all()

    # for each sentence, categorize with classifier
    for sentence in results:
        sent_id = sentence.sent_id
        text = sentence.sent_text

        for cat in categorize_text(text):
            # Look up the row for this exact (sentence, category) pair.
            # Matching on sent_id alone would treat a row of a *different*
            # category as "already present": additional categories would
            # never be inserted and the 'gltn' score could be written onto
            # another category's row.
            sentcat = SentenceCategory.query.filter(
                SentenceCategory.sent_id == sent_id,
                SentenceCategory.cat_code == cat).first()

            if cat == 'gltn':
                # for 'gltn', perform sentiment analysis
                # note: predict_sentiment components revived in function
                sentiment_score = predict_sentiment([text])
                # per the original note, element [0][2] is the
                # decision_function score -- stored as sen_score
                sen_score = sentiment_score[0][2]
                if not sentcat:
                    sentcat = SentenceCategory(sent_id=sent_id,
                                               cat_code='gltn',
                                               sen_score=sen_score)
                else:
                    sentcat.sen_score = sen_score
            elif not sentcat:
                # TODO: will have to perform sentiment analysis and update later
                sentcat = SentenceCategory(sent_id=sent_id, cat_code=cat)

            # re-adding an already persistent row is harmless
            db.session.add(sentcat)

        db.session.commit()
Example #2
0
def seed_sentcats():
    """
    For sentences in Sentences, categorize using multilabel classifier
    Add results to SentCats -- Initial seeding version

    Each sentence is categorized with categorize_text. A SentenceCategory is
    inserted for every predicted category that the sentence does not have
    yet; the 'gltn' category additionally gets a sen_score taken from
    predict_sentiment (existing 'gltn' rows have their score refreshed).
    One commit is issued per sentence.
    """
    # select all sentences from Sentences table
    results = db.session.query(Sentence).all()

    # for each sentence, categorize with classifier
    for sentence in results:
        sent_id = sentence.sent_id
        text = sentence.sent_text
        predictions = categorize_text(text)

        for cat in predictions:
            # query db to check for an entry of this sentence AND category;
            # filtering on sent_id only would let a row of another category
            # block the insert (or receive the 'gltn' score by mistake)
            sentcat = SentenceCategory.query.filter(
                SentenceCategory.sent_id == sent_id,
                SentenceCategory.cat_code == cat,
            ).first()

            if cat != 'gltn':
                if not sentcat:
                    # TODO: will have to perform sentiment analysis and update later
                    sentcat = SentenceCategory(sent_id=sent_id, cat_code=cat)
            else:
                # for 'gltn', perform sentiment analysis
                # note: predict_sentiment components revived in function
                sentiment_score = predict_sentiment([text])
                # element [0][2] is, per the original note, the
                # decision_function score
                sen_score = sentiment_score[0][2]
                if sentcat:
                    sentcat.sen_score = sen_score
                else:
                    sentcat = SentenceCategory(sent_id=sent_id,
                                               cat_code='gltn',
                                               sen_score=sen_score)

            # adding an object that is already in the session is a no-op
            db.session.add(sentcat)

        db.session.commit()
Example #3
0
def update_sentcat_score(cat_code, search_term):
    """Replace hand-built sentiment score with text-processing API score

    For every sentence whose text contains search_term, the sentence is sent
    to the text-processing.com sentiment endpoint and the returned
    probability of 'pos' is stored as sen_score:

      * if the sentence already has a SentenceCategory row, its score is
        overwritten -- unless the row's cat_code is 'gltn' or 'algy', which
        are left untouched (no API call is made for them);
      * otherwise a new SentenceCategory with cat_code is created.

    Args:
        cat_code: category code used for newly created SentenceCategory rows.
        search_term: substring that a sentence must contain to be processed.
    """
    # checking progress of update_sentcat_score('vgan', 'vegan')
    #  sqlite> select sentences.sent_text, sentcats.sentcat_id, sentcats.sen_score from sentences
    # ...> left join sentcats on sentcats.sent_id = sentences.sent_id
    # ...> where sentcats.cat_code = 'vgan'
    # ...> limit 10;

    url = "http://text-processing.com/api/sentiment/"

    # categories whose scores must not be updated again
    updated_cat_codes = ['gltn', 'algy']

    # get all sentences containing search term
    sentences = Sentence.query.filter(
        Sentence.sent_text.like('%' + search_term + '%')).all()

    # TODO: get inverse sentences (those NOT containing the term) and set
    # sen_score = 0 -- the draft query for this was never verified.
    for sentence in sentences:
        # check if sentence is in sentcat. first() yields None when there is
        # no row; one() would raise instead, which made the "add" branch
        # below unreachable.
        sentcat = SentenceCategory.query.filter(
            SentenceCategory.sent_id == sentence.sent_id).first()

        # don't update gltn/algy reviews again -- and don't spend a
        # rate-limited API call (or the pause after it) on them either
        if sentcat and sentcat.cat_code in updated_cat_codes:
            continue

        # query text-processing API for sentiment score
        payload = {'text': sentence.sent_text}
        r = requests.post(url, data=payload)

        # load JSON from API call and pull sentiment score
        api_result = json.loads(r.text)
        sen_score = api_result['probability']['pos']

        if sentcat:
            # update sen_score
            sentcat.sen_score = sen_score
        else:
            # add sentence to sentcat; the new object has to be added to the
            # session or the commit below would not persist it
            sentcat = SentenceCategory(sent_id=sentence.sent_id,
                                       cat_code=cat_code,
                                       sen_score=sen_score)
            db.session.add(sentcat)
        db.session.commit()

        # wait a random 0-10 seconds (5 on average) before the next call
        time.sleep(random.randint(0, 10))
Example #4
0
def seed_keyword_revcat(search_term, cat_code):
    """Add more revcats by using like '%vegan%'

    Tested on all reviews containing 'vegan' where reviews.biz_id=148, then
    reran for all reviews containing 'vegan' where reviews.biz_id != 148

    For every review whose text contains search_term:

      * a review without a ReviewCategory gets one (cat_code, scored with
        get_sentiment on the whole review text);
      * a review without Sentence rows is split into sentences, each stored
        as a Sentence; sentences containing search_term also get a
        SentenceCategory;
      * an existing sentence that contains search_term but has no
        SentenceCategory gets one;
      * an existing SentenceCategory with this cat_code whose sentence
        contains search_term is scored if its sen_score is empty (None or 0).

    Args:
        search_term: substring to look for, e.g. 'vegan'.
        cat_code: category code stored on the rows created here, e.g. 'vgan'.
    """
    #  sqlite> select reviews.review_id, revcats.revcat_id, sentences.sent_id from reviews
    # ...> LEFT JOIN revcats ON revcats.review_id = reviews.review_id
    # ...> LEFT JOIN sentences on sentences.review_id = reviews.review_id
    # ...> WHERE reviews.biz_id = 148 and reviews.text like '%vegan%';

    # sqlite> select count(*) from reviews where reviews.biz_id != 148 and reviews.text like '%vegan%';
    # count(*)
    # 3946

    # query db for all reviews containing the search term
    reviews = db.session.query(PlatePalReview, ReviewCategory, Sentence,
                               SentenceCategory)
    reviews_joined = reviews.outerjoin(ReviewCategory).outerjoin(
        Sentence).outerjoin(SentenceCategory)
    keyword_reviews = reviews_joined.filter(
        PlatePalReview.text.like('%' + search_term + '%'))

    # instantiate preprocessor for splitting text into sentences
    preprocessor = PennTreebankPunkt(use_flag="sentences")

    # The outer joins return one row per (review, revcat, sentence, sentcat)
    # combination, so the same review/sentence can show up in several rows.
    # Remember what was already created so nothing is inserted twice.
    revcat_done = set()    # review_ids that received a new revcat
    tokenized = set()      # review_ids already split into sentences
    sentcat_done = set()   # sent_ids that received a new sentcat

    # Each row holds single entities (or None), never lists.
    for review, revcat, sentence, sentcat in keyword_reviews:
        review_id = review.review_id

        # check if review has revcats
        if not revcat and review_id not in revcat_done:
            revcat_done.add(review_id)
            # get sentiment score of review
            sen_score = get_sentiment(review.text)
            db.session.add(ReviewCategory(review_id=review_id,
                                          biz_id=review.biz_id,
                                          cat_code=cat_code,
                                          sen_score=sen_score))
            db.session.commit()

        if not sentence:
            # review has no sentences yet: tokenize and store them (once)
            if review_id in tokenized:
                continue
            tokenized.add(review_id)
            for sent_text in preprocessor(review.text):
                sent = Sentence(review_id=review_id, sent_text=sent_text)
                db.session.add(sent)
                db.session.commit()
                # add sentences containing search_term to sentcats; use the
                # id of the row just written instead of re-querying by text,
                # which also matched identical sentences of the same review
                if search_term in sent_text:
                    _add_keyword_sentcat(sent.sent_id, sent_text, cat_code)
        elif not sentcat:
            # sentence exists but has no sentcat yet
            sent_id = sentence.sent_id
            if (sent_id not in sentcat_done
                    and search_term in sentence.sent_text):
                sentcat_done.add(sent_id)
                _add_keyword_sentcat(sent_id, sentence.sent_text, cat_code)
        elif sentcat.cat_code == cat_code:
            # there is a sentcat of this category: make sure it has a score.
            # one() returns a one-element row, so take [0] to get the string;
            # testing "in" against the row itself never did a substring match
            sentence_text = db.session.query(Sentence.sent_text).filter(
                Sentence.sent_id == sentcat.sent_id).one()[0]
            if search_term in sentence_text:
                if not sentcat.sen_score:
                    # covers both a missing (None) and a zero score
                    sentcat.sen_score = get_sentiment(sentence_text)
                    db.session.add(sentcat)
                    db.session.commit()
                else:  # there is a non-zero sentiment score
                    print("sentiment score exists for sentcat %d"
                          % sentcat.sentcat_id)


def _add_keyword_sentcat(sent_id, sent_text, cat_code):
    """Score sent_text with get_sentiment and store a SentenceCategory for sent_id."""
    sentcat = SentenceCategory(sent_id=sent_id,
                               cat_code=cat_code,
                               sen_score=get_sentiment(sent_text))
    db.session.add(sentcat)
    db.session.commit()
Example #5
0
def seed_keyword_revcat(search_term, cat_code):
    """Add more revcats by using like '%vegan%'

    Tested on all reviews containing 'vegan' where reviews.biz_id=148, then
    reran for all reviews containing 'vegan' where reviews.biz_id != 148

    For every review whose text contains search_term:

      * a review without a ReviewCategory gets one (cat_code, scored with
        get_sentiment on the whole review text);
      * a review without Sentence rows is split into sentences, each stored
        as a Sentence; sentences containing search_term also get a
        SentenceCategory;
      * an existing sentence that contains search_term but has no
        SentenceCategory gets one;
      * an existing SentenceCategory with this cat_code whose sentence
        contains search_term is scored if its sen_score is empty (None or 0).

    Args:
        search_term: substring to look for, e.g. 'vegan'.
        cat_code: category code stored on the rows created here, e.g. 'vgan'.
    """
    #  sqlite> select reviews.review_id, revcats.revcat_id, sentences.sent_id from reviews
    # ...> LEFT JOIN revcats ON revcats.review_id = reviews.review_id
    # ...> LEFT JOIN sentences on sentences.review_id = reviews.review_id
    # ...> WHERE reviews.biz_id = 148 and reviews.text like '%vegan%';

    # sqlite> select count(*) from reviews where reviews.biz_id != 148 and reviews.text like '%vegan%';
    # count(*)
    # 3946

    # query db for all reviews containing the search term
    reviews = db.session.query(PlatePalReview, ReviewCategory, Sentence,
                               SentenceCategory)
    reviews_joined = reviews.outerjoin(ReviewCategory).outerjoin(
        Sentence).outerjoin(SentenceCategory)
    keyword_reviews = reviews_joined.filter(
        PlatePalReview.text.like('%' + search_term + '%'))

    # instantiate preprocessor for splitting text into sentences
    preprocessor = PennTreebankPunkt(use_flag="sentences")

    # The outer joins return one row per (review, revcat, sentence, sentcat)
    # combination, so the same review/sentence can show up in several rows.
    # Remember what was already created so nothing is inserted twice.
    revcat_done = set()    # review_ids that received a new revcat
    tokenized = set()      # review_ids already split into sentences
    sentcat_done = set()   # sent_ids that received a new sentcat

    # Each row holds single entities (or None), never lists.
    for review, revcat, sentence, sentcat in keyword_reviews:
        review_id = review.review_id

        # check if review has revcats
        if not revcat and review_id not in revcat_done:
            revcat_done.add(review_id)
            # get sentiment score of review
            sen_score = get_sentiment(review.text)
            db.session.add(ReviewCategory(review_id=review_id,
                                          biz_id=review.biz_id,
                                          cat_code=cat_code,
                                          sen_score=sen_score))
            db.session.commit()

        if not sentence:
            # review has no sentences yet: tokenize and store them (once)
            if review_id in tokenized:
                continue
            tokenized.add(review_id)
            for sent_text in preprocessor(review.text):
                sent = Sentence(review_id=review_id, sent_text=sent_text)
                db.session.add(sent)
                db.session.commit()
                # add sentences containing search_term to sentcats; use the
                # id of the row just written instead of re-querying by text,
                # which also matched identical sentences of the same review
                if search_term in sent_text:
                    _add_keyword_sentcat(sent.sent_id, sent_text, cat_code)
        elif not sentcat:
            # sentence exists but has no sentcat yet
            sent_id = sentence.sent_id
            if (sent_id not in sentcat_done
                    and search_term in sentence.sent_text):
                sentcat_done.add(sent_id)
                _add_keyword_sentcat(sent_id, sentence.sent_text, cat_code)
        elif sentcat.cat_code == cat_code:
            # there is a sentcat of this category: make sure it has a score.
            # one() returns a one-element row, so take [0] to get the string;
            # testing "in" against the row itself never did a substring match
            sentence_text = db.session.query(Sentence.sent_text).filter(
                Sentence.sent_id == sentcat.sent_id).one()[0]
            if search_term in sentence_text:
                if not sentcat.sen_score:
                    # covers both a missing (None) and a zero score
                    sentcat.sen_score = get_sentiment(sentence_text)
                    db.session.add(sentcat)
                    db.session.commit()
                else:  # there is a non-zero sentiment score
                    print("sentiment score exists for sentcat %d"
                          % sentcat.sentcat_id)


def _add_keyword_sentcat(sent_id, sent_text, cat_code):
    """Score sent_text with get_sentiment and store a SentenceCategory for sent_id."""
    sentcat = SentenceCategory(sent_id=sent_id,
                               cat_code=cat_code,
                               sen_score=get_sentiment(sent_text))
    db.session.add(sentcat)
    db.session.commit()