def Store_Into_MongoDB(dicCrawl, client):
    """Bulk-load previously scraped CSV results into MongoDB.

    For every crawled tourist function in *dicCrawl* (values are the names
    used to build the result CSV file paths), insert both the user-profile
    CSV and the reviews CSV through MongoHelper.

    dicCrawl : dict whose values name the crawled tourist functions
    client   : MongoDB client passed through to MongoHelper
    """
    # Only the values are used to locate the CSV files; the keys are ignored.
    for value in dicCrawl.values():
        MongoHelper.Insert_CSV_Users_Into_MongoDB(
            Utils.Get_Result_CSV_File(value, Utils.Result_Type_User_Profile),
            client)
        MongoHelper.Insert_CSV_Reviews_Into_MongoDB(
            Utils.Get_Result_CSV_File(value, Utils.Result_Type_Reviews),
            client)
def Scrap_TouristFunction_Information(touristFunctionDic, touristFunctionType, client):
    """Scrape the detail page of every tourist function and persist the data.

    For each (id, name) pair in *touristFunctionDic*, fetch the TripAdvisor
    detail page plus the email overlay, extract business attributes (name,
    rating, rank, address, phone, email, ...), append a row to the result CSV
    and upsert a document into MongoDB.

    touristFunctionDic  : dict mapping tourist-function id -> name
    touristFunctionType : category string used for URLs and file names
    client              : MongoDB client passed to MongoHelper
    """
    csvFileName = Utils.Get_Result_CSV_File(touristFunctionType,
                                            Utils.Result_Type_TouristFunction)
    # Write the CSV header once; rows are appended per tourist function below.
    with open(csvFileName, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow([
            Utils.TouristFunction_Name_Key, Utils.TouristFunction_Type_Key,
            Utils.TouristFunction_Overall_Rating_Key,
            Utils.TouristFunction_Ranking_Key,
            Utils.TouristFunction_PriceRange_Key,
            Utils.TouristFunction_SubType_Key,
            Utils.TouristFunction_Address_Key,
            Utils.TouristFunction_Country_Key,
            Utils.TouristFunction_Postal_Code_Key,
            Utils.TouristFunction_Phone_Key, Utils.TouristFunction_Websit_Key,
            Utils.TouristFunction_Email_Key
        ])
    for key, value in touristFunctionDic.items():
        try:
            websiteUrl = Get_Tourist_Function_Url(touristFunctionType, key)
            overlayUrl = Utils.WebsiteUrl + '/EmailHotel?detail=' + key.lstrip(
                'd')  # for hotel email
            html = requests.get(websiteUrl)
            soup = BeautifulSoup(html.content, 'html.parser')
            html2 = requests.get(overlayUrl)
            soup2 = BeautifulSoup(html2.content, 'html.parser')
            bizInfo = soup.find(
                'div', {'id': Taplc_Location_Detail_Header(touristFunctionType)})
            bizName = bizInfo.find('h1').text.strip()
            bizRating = bizInfo.find(
                'span', {'class': 'ui_bubble_rating'})['alt'].split()[0]
            bizRankB = bizInfo.find('span', {
                'class': 'header_popularity'
            }).find('b')
            bizRank = ''
            bizSubType = []
            bizPriceRange = ''
            # The ranking may live in a nested <span> inside the <b> element.
            if bizRankB is not None:
                bizRankBSpan = bizRankB.find('span')
                if bizRankBSpan is not None:
                    bizRank = bizRankBSpan.text.lstrip('#')
                else:
                    bizRank = bizRankB.text.lstrip('#')
            bizStreetAddress = bizInfo.find('span', {
                'class': 'street-address'
            }).text
            bizPostalCode = bizInfo.find('span', {
                'class': 'locality'
            }).text.rstrip(', ')
            bizCountry = bizInfo.find('span', {
                'class': 'country-name'
            }).text.strip()
            bizPriceRangeClass = bizInfo.find('span', {'class': 'header_tags'})
            if bizPriceRangeClass is not None:
                bizPriceRange = bizPriceRangeClass.text.strip()
            # Sub-types appear either as header links or as header detail divs.
            bizSubTypeClass = bizInfo.find('span', {'class': 'header_links'})
            if bizSubTypeClass is not None:
                bizSubTypeLinkText = bizSubTypeClass.findAll('a')
                if bizSubTypeLinkText is not None:
                    for linkT in bizSubTypeLinkText:
                        bizSubType.append(linkT.text.strip())
            else:
                bizSubTypeClass = bizInfo.findAll('span',
                                                  {'class': 'header_detail'})
                if bizSubTypeClass is not None:
                    for span in bizSubTypeClass:
                        detailDiv = span.findAll('div', {'class': 'detail'})
                        if detailDiv is not None:
                            for div in detailDiv:
                                details = div.findAll('a')
                                if details is not None:
                                    for detail in details:
                                        bizSubType.append(detail.text)
            bizPhone = ''
            bizEmail = ''
            # some tourist function does not list phone number
            try:
                bizPhoneObject = bizInfo.find('div', {'class': 'blEntry phone'})
                if bizPhoneObject is not None:
                    bizPhoneSpan = bizPhoneObject.findAll('span')
                    if bizPhoneSpan is not None:
                        length = len(bizPhoneSpan)
                        bizPhoneScript = bizPhoneSpan[length - 1].find('script')
                        if bizPhoneScript is not None:
                            bizPhone = bizPhoneScript.text
                            bizPhone = Utils.RejoinPhoneNumber(
                                bizPhone
                            )  # number sequence is scrambled in souce code
            except Exception as e:
                logging.exception('error in scrapping phone information : ' +
                                  e.__str__())
            # some tourist function does not list email
            try:
                emailObject = soup2.find('input', {'id': 'receiver'})
                if emailObject is not None:
                    bizEmail = emailObject['value']
            except Exception as e:
                logging.exception('error in scrapping email information : ' +
                                  e.__str__())
            bizWebsite = Query_Tourist_Function_Website_URL(value)
            # Append one CSV row per tourist function (header written above).
            with open(csvFileName, 'a', newline='', encoding='utf-8') as f:
                spamwriter = csv.writer(f)
                spamwriter.writerow([
                    bizName, touristFunctionType, bizRating, bizRank,
                    bizPriceRange, bizSubType, bizStreetAddress, bizCountry,
                    bizPostalCode, bizPhone, bizWebsite, bizEmail
                ])
            # renamed from `dict` - never shadow the builtin
            record = {
                Utils.TouristFunction_Name_Key: bizName,
                Utils.TouristFunction_Type_Key: touristFunctionType,
                Utils.TouristFunction_Overall_Rating_Key: bizRating,
                Utils.TouristFunction_Ranking_Key: bizRank,
                Utils.TouristFunction_PriceRange_Key: bizPriceRange,
                Utils.TouristFunction_SubType_Key: bizSubType,
                Utils.TouristFunction_Address_Key: bizStreetAddress,
                Utils.TouristFunction_Country_Key: bizCountry,
                Utils.TouristFunction_Postal_Code_Key: bizPostalCode,
                Utils.TouristFunction_Phone_Key: bizPhone,
                Utils.TouristFunction_Websit_Key: bizWebsite,
                Utils.TouristFunction_Email_Key: bizEmail
            }
            MongoHelper.Insert_Data_IntoMongoDB(Utils.TripAdvisor_DB,
                                                Utils.TouristFunction_Table,
                                                client, record,
                                                Utils.DicTouristFunctionKey,
                                                bizName)
        except Exception as e:
            logging.exception('error in scrapping tourist function : ' +
                              e.__str__())
def Get_Tourist_Function_Url(touristFunctionType, touristFunctionID):
    """Return the review-page URL for one tourist function.

    The URL is the category base URL followed by the function id and the
    '-Reviews' suffix.
    """
    baseUrl = Utils.Get_Base_Url(touristFunctionType)
    return ''.join([baseUrl, touristFunctionID, '-Reviews'])
def Crawl_TouristFunction(baseUrl, touristFunctionName, touristFunction_Type, dbClient):
    """Crawl all review pages of one tourist function and dump results to CSV.

    The page list is split in half and scraped by two worker threads running
    Scraping_Procedure; the collected user profiles and reviews are then
    written to the two result CSV files for this tourist function.

    baseUrl             : first review-page URL of the tourist function
    touristFunctionName : name used for the result CSV file paths
    touristFunction_Type: category string forwarded to the scraper
    dbClient            : MongoDB client forwarded to the scraper
    """
    lock = threading.Lock()
    userWholeList = []
    reviewWholeList = []
    userRes1, reviewRes1 = [], []
    userRes2, reviewRes2 = [], []
    try:
        dic = Get_All_Page_Url(baseUrl, touristFunction_Type)
        allPageUrls = dic[Utils.UrlKey]
        totalRecord = dic[Utils.TotalNumKey]  # kept: validates the key exists
        halfNum = round(len(allPageUrls) / 2)
        # Two threads, each scraping one half of the page list into its own
        # result lists; the shared lock serializes the MongoDB inserts.
        t1 = threading.Thread(target=Scraping_Procedure,
                              args=(allPageUrls[:halfNum], reviewRes1, userRes1,
                                    touristFunctionName, touristFunction_Type,
                                    lock, dbClient))
        t2 = threading.Thread(target=Scraping_Procedure,
                              args=(allPageUrls[halfNum:], reviewRes2, userRes2,
                                    touristFunctionName, touristFunction_Type,
                                    lock, dbClient))
        t1.start()
        t2.start()
        t1.join()
        t2.join()
        print(touristFunctionName + ' finish')
        userWholeList.extend(userRes1)
        userWholeList.extend(userRes2)
        reviewWholeList.extend(reviewRes1)
        reviewWholeList.extend(reviewRes2)
    except Exception as e:
        logging.exception('Exception : ' + e.__str__())
    userProfileCSVFile = Utils.Get_Result_CSV_File(
        touristFunctionName, Utils.Result_Type_User_Profile)
    reviewCSVFile = Utils.Get_Result_CSV_File(touristFunctionName,
                                              Utils.Result_Type_Reviews)
    # Column order doubles as both CSV header and per-record field order.
    userColumns = [
        Utils.DicUserNameKey, Utils.DicBadgeLevelKey, Utils.DicTagsKey,
        Utils.DicHomeTownKey, Utils.DicAgeSinceKey, Utils.DicAgeRangeKey,
        Utils.DicGenderKey, Utils.DicUserContributionKey,
        Utils.dicCitiesVisitedKey, Utils.DicHelpfulVotesKey,
        Utils.DicPhotosKey, Utils.DicForumPostKey, Utils.DicRatingGivenKey,
        Utils.DicExcellentNumKey, Utils.DicVeryGoodNumKey,
        Utils.DicAverageNumKey, Utils.DicPoorNumKey, Utils.DicTerribleNumKey
    ]
    with open(userProfileCSVFile, 'w', newline='', encoding='utf-8') as profileCSV:
        try:
            profileWriter = csv.writer(profileCSV)
            profileWriter.writerow(userColumns)
            for userRecord in userWholeList:
                profileWriter.writerow([userRecord[col] for col in userColumns])
        except Exception:
            logging.exception('Fail to insert data to ' + userProfileCSVFile)
    reviewColumns = [
        Utils.DicUserNameKey, Utils.DicTouristFunctionKey,
        Utils.DicReviewDateKey, Utils.DicTitleKey, Utils.DicCoomentKey,
        Utils.DicCommentLangKey, Utils.DicSentimentKey,
        Utils.DicKeyPhrasesKey, Utils.DicReviewRatingKey,
        Utils.DicNumerOfHelpKey
    ]
    with open(reviewCSVFile, 'w', newline='', encoding='utf-8') as csvfile:
        try:
            spamwriter = csv.writer(csvfile)
            spamwriter.writerow(reviewColumns)
            for reviewRecord in reviewWholeList:
                spamwriter.writerow(
                    [reviewRecord[col] for col in reviewColumns])
        except Exception:
            logging.exception('Fail to insert record to ' + reviewCSVFile)
    print('finished')
def Scraping_Procedure(processUrl, reviewDic, userDic, touristFunctionName, touristFunction_Type, lock, dbClient):
    """Scrape a list of review-page URLs for one tourist function.

    For every URL: render the page with PhantomJS, expand the 'More' link,
    then for each review container extract the review itself plus the
    reviewer's profile (fetched via the member overlay and profile pages),
    insert both documents into MongoDB (serialized by *lock*) and append
    them to *reviewDic* / *userDic* (lists, despite the names).

    Returns the (reviewDic, userDic) lists it was given.
    """
    for pUrl in processUrl:
        try:
            print(pUrl)
            # NOTE(review): PhantomJS is deprecated in Selenium; also, if an
            # exception fires before driver.quit() below, the browser process
            # leaks - consider try/finally. Left unchanged here.
            driver = webdriver.PhantomJS(executable_path=Utils.PhantomJsPath)
            time.sleep(2)
            driver.get(pUrl)
            time.sleep(1)
            # Switch to 'All languages' then locate the review 'More' expander.
            link = Ensure_Click_AllLanguage_RadioButton(
                driver, pUrl, touristFunction_Type).find_element_by_xpath(
                    "//*[@class='taLnk ulBlueLinks']")
            try:
                if link is not None and link.text == 'More':
                    link.click()
                    time.sleep(2)
            except:
                logging.exception('fail to click more link in ' + pUrl)
                print('link can not click')
            pHtml = driver.page_source
            pSoup = BeautifulSoup(pHtml, 'html.parser')
            driver.quit()
            pContain = pSoup.find(
                'div', {
                    'id': Topic_Location_Two_Column_ID(touristFunction_Type)
                }).find('div', {
                    'id': Review_Container_ID(touristFunction_Type)
                }).findAll('div', {'class': 'review-container'})
            for con in pContain:
                # Per-review defaults; overwritten below when data is found.
                userName = ''
                reviewDate = ''
                quoteTitle = ''
                langUsed = ''
                memberInfoUrl = ''
                badgeLevel = ''
                ageSinceValue = ''
                hometownValue = ''
                ageAdditionalInfo = ''
                age = ''
                gender = ''
                commentText = ''
                rating = 0
                tagBlockValue = []
                contribution = 0
                cityVisited = 0
                helpfulVotes = 0
                photos = 0
                ratingsGiven = 0
                forumPosts = 0
                excellentNum = 0
                veryGoodNum = 0
                averageNum = 0
                poorNum = 0
                terribleNum = 0
                numHelp = 0
                userInfoDiv = con.find('div', {
                    'class': 'ui_column is-2'
                }).find('div',
                        {'class': 'prw_rup prw_reviews_member_info_hsx'})
                commentInfoDiv = con.find('div', {'class': 'ui_column is-9'})
                memberInfoDiv = userInfoDiv.find('div',
                                                 {'class': 'member_info'})
                memberOverlay = memberInfoDiv.find(
                    'div', {'class': 'memberOverlayLink'})
                if memberOverlay is None:
                    print(pUrl)
                    print(memberInfoDiv)
                else:
                    # The overlay element id encodes 'UID_<id>-SRC_<src>';
                    # split it to rebuild the member-overlay request URL.
                    userId = memberOverlay.get('id')
                    splitIndex = userId.index('-')
                    memberIdStr = userId[:splitIndex]
                    memberId = memberIdStr[4:]
                    srcStr = userId[splitIndex + 1:]
                    src = srcStr[4:]
                    memberInfoUrlEncode = 'Uid=' + str(
                        memberId) + '&c=&src=' + str(src)
                    memberInfoUrl = 'https://www.tripadvisor.com.sg/MemberOverlay?Mode=owa&' + memberInfoUrlEncode + Member_Overlay_Url_Query_Key(
                        touristFunction_Type)
                    print(memberInfoUrl)
                # get rating
                ratingInfo = commentInfoDiv.find(
                    'div', {'class': 'rating reviewItemInline'})
                # rating is encoded in the second CSS class, e.g. 'bubble_45'.
                ratingIcon = ratingInfo.find('span').get('class')
                if len(ratingIcon) == 2:
                    ratingText = ratingIcon[1]
                    ratingText = ratingText.replace('bubble_', '')[:1]
                    rating = int(ratingText)
                # get rating date
                reviewDate = ratingInfo.find('span', {
                    'class': 'ratingDate relativeDate'
                }).get('title')
                quoteDiv = commentInfoDiv.find('div', {'class': 'quote'})
                quoteUrl = quoteDiv.find('a').get('href')
                quoteSpan = quoteDiv.find('a').find('span',
                                                    {'class': 'noQuotes'})
                quoteTitle = Translate_Text(
                    quoteSpan.text)[Utils.Google_TranslatedKey]
                # if is_utf8(quoteTitle) == True:
                #     quoteTitle = quoteTitle.encode('utf-8')
                # get review comment
                commentTextContainer = commentInfoDiv.find(
                    'div', {'class': 'prw_rup prw_reviews_text_summary_hsx'})
                if commentTextContainer is not None:
                    commentTextField = commentTextContainer.find(
                        'p', {'class': 'partial_entry'})
                    commentText = commentTextField.text
                    moreLink = commentTextField.find(
                        'span', {'class': 'taLnk ulBlueLinks'})
                    if moreLink is not None:
                        if moreLink.text == 'More':
                            print(commentText)
                # Translate the comment, detect its language, then run the
                # text-analytics helpers on the translated text.
                translationC = Translate_Text(
                    commentText.replace('<br>', '\r\n'))
                commentText = translationC[Utils.Google_TranslatedKey]
                langUsed = translationC[Utils.Google_SourceLangKey]
                sentiment = TextAUtils.Get_Sentiment_Result(commentText)
                keyPhrases = TextAUtils.Get_KeyPhrases_From_Comment(
                    commentText)
                # get number of thumbsup
                helpfulContentDiv = commentInfoDiv.find(
                    'div', {'class': 'helpful redesigned hsx_helpful'})
                numHelpSpan = helpfulContentDiv.find(
                    'span', {'class': 'numHelp emphasizeWithColor'})
                numHelpText = numHelpSpan.text.strip()
                if numHelpText == '':
                    numHelp = 0
                else:
                    numHelp = int(numHelpText)
                if memberInfoUrl != '':
                    memberInfoHtml = requests.get(memberInfoUrl)
                    mSoup = BeautifulSoup(memberInfoHtml.content,
                                          'html.parser')
                    mOverlayDiv = mSoup.find(
                        'div', {'class': 'memberOverlayRedesign g10n'})
                    userSection = mOverlayDiv.find('a')
                    userLink = userSection.get('href')
                    userName = userSection.find(
                        'h3', {
                            'class': 'username reviewsEnhancements'
                        }).text
                    # Anonymous reviewers get a generated UUID as user name.
                    if Is_NoneOrEmpty_String(userName):
                        userName = str(uuid.uuid1())
                    memberReviewBadgeDiv = mOverlayDiv.find(
                        'div', {'class': 'memberreviewbadge'})
                    if memberReviewBadgeDiv is not None:
                        badgeInfoDiv = memberReviewBadgeDiv.find(
                            'div', {'class': 'badgeinfo'})
                        if badgeInfoDiv is not None:
                            badgeLevel = badgeInfoDiv.text.replace(
                                'Level', '').replace('Contributor',
                                                     '').strip()
                    reviewCountContainer = mOverlayDiv.find(
                        'ul', {'class': 'countsReviewEnhancements'})
                    # temporary remove this part, will get them from the member profile page
                    if reviewCountContainer is not None:
                        reviewEnhancementList = reviewCountContainer.findAll('li')
                        if reviewEnhancementList is not None:
                            # NOTE(review): loop variable shadows builtin
                            # `list`; kept unchanged in this doc pass.
                            for list in reviewEnhancementList:
                                spanValue = list.find(
                                    'span', {
                                        'class': 'badgeTextReviewEnhancements'
                                    }).text
                                # NOTE(review): {'class', ...} is a SET literal,
                                # not a dict - likely a typo for {'class': ...};
                                # confirm before changing.
                                if list.find(
                                        'span', {
                                            'class',
                                            'ui_icon globe-world iconReviewEnhancements'
                                        }) is not None:
                                    cityVisited = int(
                                        spanValue.replace('Cities visited',
                                                          '').replace(
                                                              'City visited',
                                                              '').strip())
                    reviewContributionWrap = mOverlayDiv.find(
                        'div',
                        {'class': 'wrap container histogramReviewEnhancements'})
                    if reviewContributionWrap is not None:
                        reviewContributionDiv = reviewContributionWrap.find('ul')
                        if reviewContributionDiv is not None:
                            # Per-category review histogram from the overlay.
                            for chartRowDiv in reviewContributionDiv.findAll(
                                    'div',
                                    {'class': 'chartRowReviewEnhancements'}):
                                # NOTE(review): set literal again - see above.
                                reviewCategory = chartRowDiv.find(
                                    'span', {
                                        'class',
                                        'rowLabelReviewEnhancements rowCellReviewEnhancements'
                                    }).text.strip()
                                number = chartRowDiv.find(
                                    'span', {
                                        'class':
                                        'rowCountReviewEnhancements rowCellReviewEnhancements'
                                    }).text
                                if reviewCategory == 'Excellent':
                                    excellentNum = int(number)
                                if reviewCategory == 'Very good':
                                    veryGoodNum = int(number)
                                if reviewCategory == 'Average':
                                    averageNum = int(number)
                                if reviewCategory == 'Poor':
                                    poorNum = int(number)
                                if reviewCategory == 'Terrible':
                                    terribleNum = int(number)
                    # Full member profile page for membership/age/hometown.
                    userDetailHtml = requests.get(Utils.WebsiteUrl + userLink)
                    userDetailSoup = BeautifulSoup(userDetailHtml.content,
                                                   'html.parser')
                    userDetailDiv = userDetailSoup.find(
                        'div',
                        {'class': 'modules-membercenter-member-profile '})
                    userMembershipDiv = userDetailDiv.find(
                        'div', {'class': 'profInfo'})
                    if userMembershipDiv is not None:
                        # NOTE(review): set literal again - see above.
                        ageSinceDiv = userMembershipDiv.find(
                            'div', {'class', 'ageSince'})
                        if ageSinceDiv is not None:
                            ageSinceValue = ageSinceDiv.find(
                                'p', {
                                    'class': 'since'
                                }).text.replace('Since', '').strip()
                            # Normalize relative phrases to a month/year string.
                            if ageSinceValue == 'this month' or ageSinceValue == 'this week' or ageSinceValue == 'today':
                                ageSinceValue = Utils.Get_Current_Month_Year_String(
                                )
                            else:
                                ageSinceValue = Utils.Parser_Month_Year_String(
                                    ageSinceValue)
                            allAgeInfo = ageSinceDiv.findAll('p')
                            if len(allAgeInfo) > 1:
                                ageAdditionalInfo = allAgeInfo[1].text
                                age = Get_Age_From_Text(ageAdditionalInfo)
                                gender = Get_Gender_From_Text(ageAdditionalInfo)
                        hometownDiv = userMembershipDiv.find(
                            'div', {'class': 'hometown'})
                        if hometownDiv is not None:
                            homeTownP = hometownDiv.find('p')
                            if homeTownP is not None:
                                hometownValue = homeTownP.text
                    memberTagDiv = userDetailSoup.find(
                        'div', {'class': 'modules-membercenter-member-tag '})
                    if memberTagDiv is not None:
                        tagBlock = memberTagDiv.find('div',
                                                     {'class': 'tagBlock'})
                        if tagBlock is not None:
                            for tagBubble in tagBlock.findAll(
                                    'div', {'class': 'tagBubble unclickable'}):
                                tagBlockValue.append(tagBubble.text)
                        else:
                            tagBlockValue.append('')
                    else:
                        tagBlockValue.append('')
                    # profile summary
                    profileSummaryDiv = userDetailSoup.find(
                        'div',
                        {'class': 'modules-membercenter-content-summary '})
                    if profileSummaryDiv is not None:
                        memberPointDiv = profileSummaryDiv.find(
                            'div', {'class': 'member-points'})
                        if memberPointDiv is not None:
                            # Counters on the profile summary: reviews,
                            # ratings, forum posts, photos, helpful votes.
                            for pointLi in memberPointDiv.findAll(
                                    'li', {'class': 'content-info'}):
                                pointLiReview = pointLi.find(
                                    'a', {'name': 'reviews'})
                                if pointLiReview is not None:
                                    contribution = int(
                                        pointLiReview.text.replace(
                                            'Reviews',
                                            '').replace('Review', '').strip())
                                pointLiRatings = pointLi.find(
                                    'a', {'name': 'ratings'})
                                if pointLiRatings is not None:
                                    ratingsGiven = int(
                                        pointLiRatings.text.replace(
                                            'Ratings',
                                            '').replace('Rating', '').strip())
                                pointLiForumPosts = pointLi.find(
                                    'a', {'name': 'forums'})
                                if pointLiForumPosts is not None:
                                    forumPosts = int(
                                        pointLiForumPosts.text.replace(
                                            'Forum Posts', '').replace(
                                                'Forum Post', '').strip())
                                pointLiPhotos = pointLi.find(
                                    'a', {'name': 'photos'})
                                if pointLiPhotos is not None:
                                    photos = int(
                                        pointLiPhotos.text.replace(
                                            'Photos',
                                            '').replace('Photo', '').strip())
                                pointLiHelpfulVotes = pointLi.find(
                                    'a', {'name': 'lists'})
                                if pointLiHelpfulVotes is not None:
                                    helpfulVotes = int(
                                        pointLiHelpfulVotes.text.replace(
                                            'Helpful votes', '').replace(
                                                'Helpful vote', '').strip())
                reviewDocument = {
                    Utils.DicUserNameKey: userName,
                    Utils.DicTouristFunctionKey: touristFunctionName,
                    Utils.DicReviewDateKey:
                    DateUtils.parserDateString(reviewDate),
                    Utils.DicTitleKey: quoteTitle,
                    Utils.DicCoomentKey: commentText,
                    Utils.DicCommentLangKey: langUsed,
                    Utils.DicSentimentKey: sentiment,
                    Utils.DicKeyPhrasesKey: keyPhrases,
                    Utils.DicReviewRatingKey: rating,
                    Utils.DicNumerOfHelpKey: numHelp
                }
                userDocument = {
                    Utils.DicUserNameKey: userName,
                    Utils.DicBadgeLevelKey: badgeLevel,
                    Utils.DicTagsKey: tagBlockValue,
                    Utils.DicHomeTownKey: hometownValue,
                    Utils.DicAgeSinceKey: ageSinceValue,
                    Utils.DicAgeRangeKey: age,
                    Utils.DicGenderKey: gender,
                    Utils.DicUserContributionKey: contribution,
                    Utils.dicCitiesVisitedKey: cityVisited,
                    Utils.DicHelpfulVotesKey: helpfulVotes,
                    Utils.DicPhotosKey: photos,
                    Utils.DicForumPostKey: forumPosts,
                    Utils.DicRatingGivenKey: ratingsGiven,
                    Utils.DicExcellentNumKey: excellentNum,
                    Utils.DicVeryGoodNumKey: veryGoodNum,
                    Utils.DicAverageNumKey: averageNum,
                    Utils.DicPoorNumKey: poorNum,
                    Utils.DicTerribleNumKey: terribleNum
                }
                # Serialize DB writes across the two scraper threads.
                with lock:
                    try:
                        MongoHelper.Insert_Data_IntoMongoDB(
                            Utils.TripAdvisor_DB, Utils.Reviews_Table,
                            dbClient, reviewDocument)
                        MongoHelper.Insert_Data_IntoMongoDB(
                            Utils.TripAdvisor_DB, Utils.Users_Table, dbClient,
                            userDocument, Utils.DicUserNameKey, userName)
                    except Exception as e:
                        logging.exception('error in insert into database : ' +
                                          e.__str__())
                reviewDic.append(reviewDocument)
                userDic.append(userDocument)
        except Exception as e:
            logging.exception('Error' + e.__str__() + ' in : ' + pUrl)
            pass
    return reviewDic, userDic
def Show_Avg_Review_Rating_DateRange(renovateStart, renovationEnd, functionName, keyword='', daysBefore=365, daysAfter=365):
    """Plot the mean review rating before, during and after a renovation.

    Queries the module-level Mongo *collection* for the average rating in
    three windows (daysBefore days before the start, the renovation period,
    daysAfter days after the end - clamped to the newest review on record)
    and shows them as three points with dashed reference lines.

    renovateStart / renovationEnd : date strings parsed by DateUtils.parserDate
    functionName : tourist-function name used to filter the collection
    keyword      : optional keyword filter for the aggregation
    """
    renovateStart_Date = DateUtils.parserDate(renovateStart)
    oneDayBefore = One_Day_Before(renovateStart_Date)
    oneYearBefore = Get_Past_Date_Object(oneDayBefore, daysBefore)
    renovateEnd_Date = DateUtils.parserDate(renovationEnd)
    # Negative offset moves forward in time from the day after the end.
    oneDayAfter = One_Day_After(renovateEnd_Date)
    oneYearAfter = Get_Past_Date_Object(oneDayAfter, -daysAfter)
    # Clamp the "after" window to the newest review we actually have.
    latestRecord = MongoUtils.getLatestDate(collection, functionName)
    if oneYearAfter > latestRecord:
        oneYearAfter = latestRecord
    before_renovation = MongoUtils.averageReviewRating_DateRange(
        collection, oneYearBefore, oneDayBefore, functionName, keyword)
    undergoing_renovation = MongoUtils.averageReviewRating_DateRange(
        collection, renovateStart_Date, renovateEnd_Date, functionName,
        keyword)
    after_renovation = MongoUtils.averageReviewRating_DateRange(
        collection, oneDayAfter, oneYearAfter, functionName, keyword)
    df_before = pd.DataFrame(list(before_renovation))
    df_undergoing = pd.DataFrame(list(undergoing_renovation))
    df_after = pd.DataFrame(list(after_renovation))
    # NOTE(review): assumes each aggregation returned at least one row with
    # an 'avgRating' field - an empty window would raise here; confirm.
    Raw_Data = {
        Utils.TimeLine_Col: [1, 2, 3],
        Utils.Average_Rating_Col: [
            Utils.FormatFloatWithTwoDecimalPlace(df_before['avgRating'][0]),
            Utils.FormatFloatWithTwoDecimalPlace(
                df_undergoing['avgRating'][0]),
            Utils.FormatFloatWithTwoDecimalPlace(df_after['avgRating'][0])
        ]
    }
    plt.figure(1)
    plt.subplot(211)
    # Label the three x positions with their date ranges.
    plt.xticks(Raw_Data[Utils.TimeLine_Col], [
        Get_Display_DateRange(oneYearBefore, oneDayBefore),
        Get_Display_DateRange(renovateStart, renovationEnd),
        Get_Display_DateRange(oneDayAfter, oneYearAfter)
    ])
    # Dashed horizontal guides at each period's mean rating.
    plt.axhline(y=Raw_Data[Utils.Average_Rating_Col][0],
                xmin=0,
                xmax=1 / 6,
                c=Utils.Orange_Color,
                linestyle='dashed')
    plt.axhline(y=Raw_Data[Utils.Average_Rating_Col][1],
                xmin=0,
                xmax=0.5,
                c=Utils.Light_Green_Color,
                linestyle='dashed')
    plt.axhline(y=Raw_Data[Utils.Average_Rating_Col][2],
                xmin=0,
                xmax=5 / 6,
                c=Utils.Blue_Color,
                linestyle='dashed')
    plt.ylabel('Rating Mean')
    plt.plot(Raw_Data[Utils.TimeLine_Col],
             Raw_Data[Utils.Average_Rating_Col], 'bo')
    kw = Get_Append_Keyword_Title(keyword)
    plt.title(functionName + ('\n Rating Mean' + kw if kw != '' else ''),
              loc='center')
    # Pad the y axis half a point around the plotted means.
    plt.axis([
        0.5, 3.5,
        min(Raw_Data[Utils.Average_Rating_Col]) - 0.5,
        max(Raw_Data[Utils.Average_Rating_Col]) + 0.5
    ])
    plt.show()
def Show_Sentiment_Stacked_Bar_DateRange(renovateStart, renovationEnd, functionName, keyword='', daysBefore=365, daysAfter=365):
    """Plot a stacked percentage bar chart of review sentiment per period.

    Aggregates negative/neutral/positive review counts before, during and
    after a renovation window and renders them as 100%-stacked bars. Prints
    'no record found' and returns if any period has no data.

    renovateStart / renovationEnd : date strings parsed by DateUtils.parserDate
    functionName : tourist-function name used to filter the collection
    keyword      : optional keyword filter for the aggregation
    daysBefore / daysAfter : size of the comparison windows in days
    """
    renovateStart_Date = DateUtils.parserDate(renovateStart)
    # FIX: pass the parsed date object, matching the sibling rating chart;
    # the original passed the raw renovateStart string.
    oneDayBefore = One_Day_Before(renovateStart_Date)
    oneYearBefore = Get_Past_Date_Object(oneDayBefore, daysBefore)
    renovateEnd_Date = DateUtils.parserDate(renovationEnd)
    oneDayAfter = One_Day_After(renovateEnd_Date)
    oneYearAfter = Get_Past_Date_Object(oneDayAfter, -daysAfter)
    # Clamp the "after" window to the newest review on record.
    latestRecord = MongoUtils.getLatestDate(collection, functionName)
    if oneYearAfter > latestRecord:
        oneYearAfter = latestRecord
    before_renovation = MongoUtils.aggregateSentiment(collection,
                                                      oneYearBefore,
                                                      oneDayBefore,
                                                      functionName, keyword)
    undergoing_renovation = MongoUtils.aggregateSentiment(
        collection, renovateStart_Date, renovateEnd_Date, functionName,
        keyword)
    after_renovation = MongoUtils.aggregateSentiment(collection, oneDayAfter,
                                                     oneYearAfter,
                                                     functionName, keyword)
    df_before = pd.DataFrame(list(before_renovation))
    df_undergoing = pd.DataFrame(list(undergoing_renovation))
    df_after = pd.DataFrame(list(after_renovation))
    if df_before.empty or df_undergoing.empty or df_after.empty:
        print('no record found')
        return
    # FIX: sort only after the empty check - sort_values(by=['_id']) raises
    # KeyError on an empty frame that has no '_id' column.
    df_before = df_before.sort_values(by=['_id'])
    df_undergoing = df_undergoing.sort_values(by=['_id'])
    df_after = df_after.sort_values(by=['_id'])
    # after sort the dataframe, the name should be in this order
    name_Value = [
        Utils.Negative_Col.lower(),
        Utils.Neutral_Col.lower(),
        Utils.Positive_Col.lower()
    ]

    def _sentiment_counts(df):
        # Map aggregation rows onto fixed [negative, neutral, positive]
        # slots; a frame may lack some sentiment categories entirely.
        counts = [0, 0, 0]
        for _, row in df.iterrows():
            # FIX: list.index raises ValueError for an unknown id (it never
            # returns -1 as the original `p != -1` check assumed), so test
            # membership first.
            if row['_id'] in name_Value:
                counts[name_Value.index(row['_id'])] = row['total']
        return counts

    before_V = _sentiment_counts(df_before)
    undergoing_V = _sentiment_counts(df_undergoing)
    after_V = _sentiment_counts(df_after)
    raw_data = {
        Utils.TimeLine_Col: [
            Get_Display_DateRange(oneYearBefore, oneDayBefore),
            Get_Display_DateRange(renovateStart, renovationEnd),
            Get_Display_DateRange(oneDayAfter, oneYearAfter)
        ],
        Utils.Negative_Col: [before_V[0], undergoing_V[0], after_V[0]],
        Utils.Neutral_Col: [before_V[1], undergoing_V[1], after_V[1]],
        Utils.Positive_Col: [before_V[2], undergoing_V[2], after_V[2]]
    }
    df = pd.DataFrame(raw_data,
                      columns=[
                          Utils.TimeLine_Col, Utils.Negative_Col,
                          Utils.Neutral_Col, Utils.Positive_Col
                      ])
    # Create a figure with a single subplot
    f, ax = plt.subplots(1, figsize=(6, 3))
    bar_width = 1
    # positions of the left bar-boundaries and of the x-axis tick centres
    bar_l = [i for i in range(len(df[Utils.Negative_Col]))]
    tick_pos = [i + (bar_width / 2) for i in bar_l]
    # Total review count per period, then each sentiment as a percentage.
    totals = [
        i + j + k for i, j, k in zip(df[Utils.Negative_Col],
                                     df[Utils.Neutral_Col],
                                     df[Utils.Positive_Col])
    ]
    negative_R = [i / j * 100 for i, j in zip(df[Utils.Negative_Col], totals)]
    neutral_R = [i / j * 100 for i, j in zip(df[Utils.Neutral_Col], totals)]
    positive_R = [i / j * 100 for i, j in zip(df[Utils.Positive_Col], totals)]
    # Stacked bars: negative at the bottom, neutral, then positive on top.
    p1 = ax.bar(bar_l,
                negative_R,
                label=Utils.Before_Renovation_Col,
                alpha=0.9,
                color=Utils.Orange_Color,
                width=bar_width,
                edgecolor='white')
    p2 = ax.bar(bar_l,
                neutral_R,
                bottom=negative_R,
                label=Utils.Undergoing_Renovation_Col,
                alpha=0.9,
                color=Utils.Light_Green_Color,
                width=bar_width,
                edgecolor='white')
    p3 = ax.bar(bar_l,
                positive_R,
                bottom=[i + j for i, j in zip(negative_R, neutral_R)],
                label=Utils.After_Renovation_Col,
                alpha=0.9,
                color=Utils.Blue_Color,
                width=bar_width,
                edgecolor='white')
    plt.xticks(tick_pos, df[Utils.TimeLine_Col])

    def to_percent(y, position):
        # Ignore the passed in position. This has the effect of scaling the
        # default tick locations.
        s = str(y)
        # The percent symbol needs escaping in latex
        if matplotlib.rcParams['text.usetex'] is True:
            return s + r'$\%$'
        else:
            return s + '%'

    formatter = FuncFormatter(to_percent)
    plt.gca().yaxis.set_major_formatter(formatter)
    # NOTE(review): the original annotates with (v, i) - value as x, index
    # as y - which looks like swapped arguments; kept as-is to preserve the
    # current output until confirmed.
    for i, v in enumerate(df[Utils.Negative_Col]):
        ax.text(v, i, str(v), color='blue', fontweight='bold')
    kw = Get_Append_Keyword_Title(keyword)
    plt.title(functionName + ('\nSentiment Comparison' + kw
                              if kw != '' else ' Sentiment Comparison'),
              loc='center')
    # Let the borders of the graphic
    plt.xlim(
        [min(tick_pos) - bar_width * 1.1,
         max(tick_pos) + bar_width * 0.4])
    plt.ylim(-5, 105)
    # FIX: Axes.text takes the label as the 's' parameter; the original
    # passed s=40 plus a 'text' property override. Pass the percentage label
    # directly as s so the same annotation renders without the misuse.
    plt.text(x=-0.1,
             y=negative_R[0] / 2,
             s=Utils.Format_To_Percentage_With_OneFloatPoint(negative_R[0]),
             ha='left')
    plt.text(x=0.9,
             y=negative_R[1] / 2,
             s=Utils.Format_To_Percentage_With_OneFloatPoint(negative_R[1]),
             ha='left')
    plt.text(x=2,
             y=negative_R[2] / 2,
             s=Utils.Format_To_Percentage_With_OneFloatPoint(negative_R[2]),
             ha='left')
    # rotate axis labels
    plt.setp(plt.gca().get_xticklabels(),
             rotation=0,
             horizontalalignment='right')
    plt.legend((p3[0], p2[0], p1[0]),
               (Utils.Positive_Col, Utils.Neutral_Col, Utils.Negative_Col),
               loc='center right')
    # shot plot
    plt.show()