def Store_Into_MongoDB(dicCrawl, client):
    """Import the user-profile and review CSV files of every crawl entry.

    For each value in ``dicCrawl`` the matching user-profile CSV and review
    CSV paths are resolved through ``Utils.Get_Result_CSV_File`` and handed
    to the corresponding ``MongoHelper`` importer. The dictionary keys are
    not used.

    Args:
        dicCrawl: mapping whose values are passed as the first argument of
            ``Utils.Get_Result_CSV_File``.
        client: forwarded unchanged to both ``MongoHelper`` importers.
    """
    for crawlValue in dicCrawl.values():
        # Users first, then reviews, for each entry.
        userCsvPath = Utils.Get_Result_CSV_File(
            crawlValue, Utils.Result_Type_User_Profile)
        MongoHelper.Insert_CSV_Users_Into_MongoDB(userCsvPath, client)

        reviewCsvPath = Utils.Get_Result_CSV_File(crawlValue,
                                                  Utils.Result_Type_Reviews)
        MongoHelper.Insert_CSV_Reviews_Into_MongoDB(reviewCsvPath, client)
def Scrap_TouristFunction_Information(touristFunctionDic, touristFunctionType,
                                      client):
    """Scrape the detail page of every tourist function and persist it.

    A result CSV (path from ``Utils.Get_Result_CSV_File``) is recreated with
    a header row. Then, for every entry of ``touristFunctionDic``, the detail
    page and the e-mail overlay page are downloaded and parsed, one row is
    appended to the CSV and the same record is stored through
    ``MongoHelper.Insert_Data_IntoMongoDB``. Any exception raised while
    handling one entry is logged and the loop moves on to the next entry.

    Args:
        touristFunctionDic: mapping; each key is used to build the page URLs
            and each value is passed to ``Query_Tourist_Function_Website_URL``.
        touristFunctionType: type identifier used for URL building, header
            lookup and the CSV file name; also written into every record.
        client: forwarded to ``MongoHelper.Insert_Data_IntoMongoDB``.
    """
    csvFileName = Utils.Get_Result_CSV_File(touristFunctionType,
                                            Utils.Result_Type_TouristFunction)
    # One ordered list drives the CSV header, the CSV rows and the keys of
    # the stored record, so the three can never drift apart.
    columnKeys = [
        Utils.TouristFunction_Name_Key, Utils.TouristFunction_Type_Key,
        Utils.TouristFunction_Overall_Rating_Key,
        Utils.TouristFunction_Ranking_Key,
        Utils.TouristFunction_PriceRange_Key,
        Utils.TouristFunction_SubType_Key,
        Utils.TouristFunction_Address_Key,
        Utils.TouristFunction_Country_Key,
        Utils.TouristFunction_Postal_Code_Key,
        Utils.TouristFunction_Phone_Key, Utils.TouristFunction_Websit_Key,
        Utils.TouristFunction_Email_Key
    ]
    # Start from a fresh file that contains only the header row; the
    # ``with`` block closes the file, no explicit close() is needed.
    with open(csvFileName, 'w', newline='', encoding='utf-8') as f:
        csv.writer(f).writerow(columnKeys)

    for key, value in touristFunctionDic.items():
        try:
            websiteUrl = Get_Tourist_Function_Url(touristFunctionType, key)
            # for hotel email; lstrip removes every leading 'd' of the key
            overlayUrl = (Utils.WebsiteUrl + '/EmailHotel?detail=' +
                          key.lstrip('d'))

            html = requests.get(websiteUrl)
            soup = BeautifulSoup(html.content, 'html.parser')

            html2 = requests.get(overlayUrl)
            soup2 = BeautifulSoup(html2.content, 'html.parser')

            bizInfo = soup.find(
                'div',
                {'id': Taplc_Location_Detail_Header(touristFunctionType)})

            bizName = bizInfo.find('h1').text.strip()
            bizRating = bizInfo.find(
                'span', {'class': 'ui_bubble_rating'})['alt'].split()[0]

            # The ranking is optional: a page without the popularity span
            # keeps an empty rank instead of discarding the whole record.
            bizRank = ''
            popularitySpan = bizInfo.find('span',
                                          {'class': 'header_popularity'})
            bizRankB = None
            if popularitySpan is not None:
                bizRankB = popularitySpan.find('b')
            if bizRankB is not None:
                bizRankBSpan = bizRankB.find('span')
                rankSource = bizRankB if bizRankBSpan is None else bizRankBSpan
                bizRank = rankSource.text.lstrip('#')

            bizStreetAddress = bizInfo.find('span', {
                'class': 'street-address'
            }).text
            # NOTE(review): the postal code is read from the 'locality'
            # span - confirm this is the intended element.
            bizPostalCode = bizInfo.find('span', {
                'class': 'locality'
            }).text.rstrip(', ')
            bizCountry = bizInfo.find('span', {
                'class': 'country-name'
            }).text.strip()

            bizPriceRange = ''
            bizPriceRangeClass = bizInfo.find('span', {'class': 'header_tags'})
            if bizPriceRangeClass is not None:
                bizPriceRange = bizPriceRangeClass.text.strip()

            # Sub types come from the 'header_links' span when present,
            # otherwise from the links inside the 'header_detail' spans.
            # findAll always returns a list, so no None checks are needed.
            bizSubType = []
            bizSubTypeClass = bizInfo.find('span', {'class': 'header_links'})
            if bizSubTypeClass is not None:
                for linkT in bizSubTypeClass.findAll('a'):
                    bizSubType.append(linkT.text.strip())
            else:
                for span in bizInfo.findAll('span',
                                            {'class': 'header_detail'}):
                    for div in span.findAll('div', {'class': 'detail'}):
                        for detail in div.findAll('a'):
                            bizSubType.append(detail.text)

            bizPhone = ''
            bizEmail = ''
            # some tourist function does not list phone number
            try:
                bizPhoneObject = bizInfo.find('div',
                                              {'class': 'blEntry phone'})
                if bizPhoneObject is not None:
                    bizPhoneSpan = bizPhoneObject.findAll('span')
                    if bizPhoneSpan:
                        # the number lives in a script inside the last span
                        bizPhoneScript = bizPhoneSpan[-1].find('script')
                        if bizPhoneScript is not None:
                            # number sequence is scrambled in source code
                            bizPhone = Utils.RejoinPhoneNumber(
                                bizPhoneScript.text)
            except Exception as e:
                logging.exception('error in scrapping phone information : ' +
                                  str(e))

            # some tourist function does not list email
            try:
                emailObject = soup2.find('input', {'id': 'receiver'})
                if emailObject is not None:
                    bizEmail = emailObject['value']
            except Exception as e:
                logging.exception('error in scrapping email information : ' +
                                  str(e))

            bizWebsite = Query_Tourist_Function_Website_URL(value)

            row = [
                bizName, touristFunctionType, bizRating, bizRank,
                bizPriceRange, bizSubType, bizStreetAddress, bizCountry,
                bizPostalCode, bizPhone, bizWebsite, bizEmail
            ]
            # Append (re-opening per record) so rows already scraped are on
            # disk even if a later entry fails.
            with open(csvFileName, 'a', newline='', encoding='utf-8') as f:
                csv.writer(f).writerow(row)

            record = dict(zip(columnKeys, row))
            MongoHelper.Insert_Data_IntoMongoDB(Utils.TripAdvisor_DB,
                                                Utils.TouristFunction_Table,
                                                client, record,
                                                Utils.DicTouristFunctionKey,
                                                bizName)
        except Exception as e:
            logging.exception('error in scrapping tourist function : ' +
                              str(e))
def Get_Tourist_Function_Url(touristFunctionType, touristFunctionID):
    """Return the reviews URL of one tourist function.

    The URL is the base URL that ``Utils.Get_Base_Url`` gives for
    ``touristFunctionType``, followed by ``touristFunctionID`` and the
    literal suffix ``'-Reviews'``.
    """
    baseUrl = Utils.Get_Base_Url(touristFunctionType)
    reviewsUrl = baseUrl + touristFunctionID + '-Reviews'
    return reviewsUrl
def _Write_Documents_To_CSV(csvPath, columnKeys, documents, failureMessage):
    """Write a header row plus one row per document; log failureMessage on error."""
    with open(csvPath, 'w', newline='', encoding='utf-8') as csvFile:
        try:
            writer = csv.writer(csvFile)
            writer.writerow(columnKeys)
            for document in documents:
                # All values are looked up before the row is written, so a
                # document with a missing key never produces a partial row.
                writer.writerow(
                    [document[columnKey] for columnKey in columnKeys])
        except Exception:
            # Best effort: rows written so far stay in the file.
            logging.exception(failureMessage)


def Crawl_TouristFunction(baseUrl, touristFunctionName, touristFunction_Type,
                          dbClient):
    """Crawl all review pages of one tourist function and dump them to CSV.

    The page URLs returned by ``Get_All_Page_Url`` are split into two halves
    that are scraped by two ``Scraping_Procedure`` threads sharing one lock.
    After both threads finish, the collected user documents and review
    documents are written to the user-profile CSV and the review CSV whose
    paths come from ``Utils.Get_Result_CSV_File``.

    A failure while crawling is logged and the CSV files are still written
    (with headers only if nothing was collected). A failure while writing a
    CSV file is logged as well; an error opening the file propagates.

    Args:
        baseUrl: passed to ``Get_All_Page_Url``.
        touristFunctionName: name stored with each review and used for the
            CSV file names.
        touristFunction_Type: type identifier forwarded to the helpers.
        dbClient: forwarded to ``Scraping_Procedure``.
    """
    lock = threading.Lock()
    userWholeList = []
    reviewWholeList = []
    # Each worker thread fills its own pair of result lists.
    userRes1, reviewRes1 = [], []
    userRes2, reviewRes2 = [], []
    try:
        dic = Get_All_Page_Url(baseUrl, touristFunction_Type)
        allPageUrls = dic[Utils.UrlKey]
        halfNum = round(len(allPageUrls) / 2)
        t1 = threading.Thread(target=Scraping_Procedure,
                              args=(allPageUrls[:halfNum], reviewRes1,
                                    userRes1, touristFunctionName,
                                    touristFunction_Type, lock, dbClient))
        t2 = threading.Thread(target=Scraping_Procedure,
                              args=(allPageUrls[halfNum:], reviewRes2,
                                    userRes2, touristFunctionName,
                                    touristFunction_Type, lock, dbClient))

        t1.start()
        t2.start()
        t1.join()
        t2.join()

        print(touristFunctionName + ' finish')

        userWholeList.extend(userRes1)
        userWholeList.extend(userRes2)
        reviewWholeList.extend(reviewRes1)
        reviewWholeList.extend(reviewRes2)
    except Exception as e:
        logging.exception('Exception : ' + str(e))

    userProfileCSVFile = Utils.Get_Result_CSV_File(
        touristFunctionName, Utils.Result_Type_User_Profile)
    reviewCSVFile = Utils.Get_Result_CSV_File(touristFunctionName,
                                              Utils.Result_Type_Reviews)

    # Column order of the user-profile CSV (header and rows).
    userColumnKeys = [
        Utils.DicUserNameKey, Utils.DicBadgeLevelKey, Utils.DicTagsKey,
        Utils.DicHomeTownKey, Utils.DicAgeSinceKey, Utils.DicAgeRangeKey,
        Utils.DicGenderKey, Utils.DicUserContributionKey,
        Utils.dicCitiesVisitedKey, Utils.DicHelpfulVotesKey,
        Utils.DicPhotosKey, Utils.DicForumPostKey, Utils.DicRatingGivenKey,
        Utils.DicExcellentNumKey, Utils.DicVeryGoodNumKey,
        Utils.DicAverageNumKey, Utils.DicPoorNumKey, Utils.DicTerribleNumKey
    ]
    _Write_Documents_To_CSV(userProfileCSVFile, userColumnKeys, userWholeList,
                            'Fail to insert data to ' + userProfileCSVFile)

    # Column order of the review CSV (header and rows).
    reviewColumnKeys = [
        Utils.DicUserNameKey, Utils.DicTouristFunctionKey,
        Utils.DicReviewDateKey, Utils.DicTitleKey, Utils.DicCoomentKey,
        Utils.DicCommentLangKey, Utils.DicSentimentKey,
        Utils.DicKeyPhrasesKey, Utils.DicReviewRatingKey,
        Utils.DicNumerOfHelpKey
    ]
    _Write_Documents_To_CSV(reviewCSVFile, reviewColumnKeys, reviewWholeList,
                            'Fail to insert record to ' + reviewCSVFile)
    print('finished')
def _Render_Review_Page(pUrl, touristFunction_Type):
    """Render one review page in PhantomJS and return it as a parsed soup."""
    driver = webdriver.PhantomJS(executable_path=Utils.PhantomJsPath)
    try:
        time.sleep(2)
        driver.get(pUrl)
        time.sleep(1)
        # NOTE(review): find_element_by_xpath raises when no such link is
        # on the page, which makes the caller skip the page - confirm that
        # this is intended.
        link = Ensure_Click_AllLanguage_RadioButton(
            driver, pUrl, touristFunction_Type).find_element_by_xpath(
                "//*[@class='taLnk ulBlueLinks']")
        try:
            if link is not None and link.text == 'More':
                link.click()
                time.sleep(2)
        except Exception:
            logging.exception('fail to click more link in ' + pUrl)
            print('link can not click')
        return BeautifulSoup(driver.page_source, 'html.parser')
    finally:
        # Always release the browser process, also when rendering fails.
        driver.quit()


def _Fill_User_From_Member_Overlay(userDocument, memberInfoUrl):
    """Copy name, badge, cities and rating histogram from the member overlay
    into userDocument and return the profile link found in the overlay."""
    memberInfoHtml = requests.get(memberInfoUrl)
    mSoup = BeautifulSoup(memberInfoHtml.content, 'html.parser')
    mOverlayDiv = mSoup.find('div', {'class': 'memberOverlayRedesign g10n'})
    userSection = mOverlayDiv.find('a')
    userLink = userSection.get('href')
    userDocument[Utils.DicUserNameKey] = userSection.find(
        'h3', {
            'class': 'username reviewsEnhancements'
        }).text

    memberReviewBadgeDiv = mOverlayDiv.find('div',
                                            {'class': 'memberreviewbadge'})
    if memberReviewBadgeDiv is not None:
        badgeInfoDiv = memberReviewBadgeDiv.find('div',
                                                 {'class': 'badgeinfo'})
        if badgeInfoDiv is not None:
            userDocument[Utils.DicBadgeLevelKey] = badgeInfoDiv.text.replace(
                'Level', '').replace('Contributor', '').strip()

    # Only the "cities visited" counter is taken from the overlay; the
    # other counters are read from the member profile page.
    reviewCountContainer = mOverlayDiv.find(
        'ul', {'class': 'countsReviewEnhancements'})
    if reviewCountContainer is not None:
        for countItem in reviewCountContainer.findAll('li'):
            spanValue = countItem.find(
                'span', {
                    'class': 'badgeTextReviewEnhancements'
                }).text
            globeIcon = countItem.find(
                'span',
                {'class': 'ui_icon globe-world iconReviewEnhancements'})
            if globeIcon is not None:
                userDocument[Utils.dicCitiesVisitedKey] = int(
                    spanValue.replace('Cities visited',
                                      '').replace('City visited',
                                                  '').strip())

    histogramKeys = {
        'Excellent': Utils.DicExcellentNumKey,
        'Very good': Utils.DicVeryGoodNumKey,
        'Average': Utils.DicAverageNumKey,
        'Poor': Utils.DicPoorNumKey,
        'Terrible': Utils.DicTerribleNumKey
    }
    reviewContributionWrap = mOverlayDiv.find(
        'div', {'class': 'wrap container histogramReviewEnhancements'})
    if reviewContributionWrap is not None:
        reviewContributionDiv = reviewContributionWrap.find('ul')
        if reviewContributionDiv is not None:
            for chartRowDiv in reviewContributionDiv.findAll(
                    'div', {'class': 'chartRowReviewEnhancements'}):
                reviewCategory = chartRowDiv.find(
                    'span', {
                        'class':
                        'rowLabelReviewEnhancements rowCellReviewEnhancements'
                    }).text.strip()
                number = chartRowDiv.find(
                    'span', {
                        'class':
                        'rowCountReviewEnhancements rowCellReviewEnhancements'
                    }).text
                if reviewCategory in histogramKeys:
                    userDocument[histogramKeys[reviewCategory]] = int(number)
    return userLink


def _Fill_User_From_Profile_Page(userDocument, userLink):
    """Copy membership, hometown, tags and counters from the member profile
    page into userDocument."""
    userDetailHtml = requests.get(Utils.WebsiteUrl + userLink)
    userDetailSoup = BeautifulSoup(userDetailHtml.content, 'html.parser')
    # The class names below keep their trailing space exactly as written.
    userDetailDiv = userDetailSoup.find(
        'div', {'class': 'modules-membercenter-member-profile '})
    userMembershipDiv = userDetailDiv.find('div', {'class': 'profInfo'})

    if userMembershipDiv is not None:
        ageSinceDiv = userMembershipDiv.find('div', {'class': 'ageSince'})
        if ageSinceDiv is not None:
            ageSinceValue = ageSinceDiv.find('p', {
                'class': 'since'
            }).text.replace('Since', '').strip()
            if ageSinceValue in ('this month', 'this week', 'today'):
                ageSinceValue = Utils.Get_Current_Month_Year_String()
            else:
                ageSinceValue = Utils.Parser_Month_Year_String(ageSinceValue)
            userDocument[Utils.DicAgeSinceKey] = ageSinceValue
            allAgeInfo = ageSinceDiv.findAll('p')
            if len(allAgeInfo) > 1:
                ageAdditionalInfo = allAgeInfo[1].text
                userDocument[Utils.DicAgeRangeKey] = Get_Age_From_Text(
                    ageAdditionalInfo)
                userDocument[Utils.DicGenderKey] = Get_Gender_From_Text(
                    ageAdditionalInfo)
        hometownDiv = userMembershipDiv.find('div', {'class': 'hometown'})
        if hometownDiv is not None:
            homeTownP = hometownDiv.find('p')
            if homeTownP is not None:
                userDocument[Utils.DicHomeTownKey] = homeTownP.text

    tagBlockValue = userDocument[Utils.DicTagsKey]
    memberTagDiv = userDetailSoup.find(
        'div', {'class': 'modules-membercenter-member-tag '})
    tagBlock = None
    if memberTagDiv is not None:
        tagBlock = memberTagDiv.find('div', {'class': 'tagBlock'})
    if tagBlock is not None:
        for tagBubble in tagBlock.findAll(
                'div', {'class': 'tagBubble unclickable'}):
            tagBlockValue.append(tagBubble.text)
    else:
        # no tag module / tag block on the page
        tagBlockValue.append('')

    # profile summary: (anchor name, document key, plural label, singular)
    # NOTE(review): helpful votes are read from the anchor named 'lists' -
    # confirm against the page markup.
    pointFields = (
        ('reviews', Utils.DicUserContributionKey, 'Reviews', 'Review'),
        ('ratings', Utils.DicRatingGivenKey, 'Ratings', 'Rating'),
        ('forums', Utils.DicForumPostKey, 'Forum Posts', 'Forum Post'),
        ('photos', Utils.DicPhotosKey, 'Photos', 'Photo'),
        ('lists', Utils.DicHelpfulVotesKey, 'Helpful votes', 'Helpful vote'),
    )
    profileSummaryDiv = userDetailSoup.find(
        'div', {'class': 'modules-membercenter-content-summary '})
    if profileSummaryDiv is not None:
        memberPointDiv = profileSummaryDiv.find('div',
                                                {'class': 'member-points'})
        if memberPointDiv is not None:
            for pointLi in memberPointDiv.findAll('li',
                                                  {'class': 'content-info'}):
                for anchorName, documentKey, plural, singular in pointFields:
                    pointAnchor = pointLi.find('a', {'name': anchorName})
                    if pointAnchor is not None:
                        userDocument[documentKey] = int(
                            pointAnchor.text.replace(plural, '').replace(
                                singular, '').strip())


def _Scrape_Review_Container(con, pUrl, touristFunctionName,
                             touristFunction_Type):
    """Turn one 'review-container' element into (reviewDocument, userDocument)."""
    userInfoDiv = con.find('div', {
        'class': 'ui_column is-2'
    }).find('div', {'class': 'prw_rup prw_reviews_member_info_hsx'})
    commentInfoDiv = con.find('div', {'class': 'ui_column is-9'})
    memberInfoDiv = userInfoDiv.find('div', {'class': 'member_info'})
    memberOverlay = memberInfoDiv.find('div', {'class': 'memberOverlayLink'})
    memberInfoUrl = ''
    if memberOverlay is None:
        print(pUrl)
        print(memberInfoDiv)
    else:
        # The element id has two parts separated by the first '-'; the
        # first four characters of each part are dropped to obtain the
        # member id and the src value.
        userId = memberOverlay.get('id')
        splitIndex = userId.index('-')
        memberId = userId[:splitIndex][4:]
        src = userId[splitIndex + 1:][4:]
        memberInfoUrlEncode = 'Uid=' + str(memberId) + '&c=&src=' + str(src)
        memberInfoUrl = (
            'https://www.tripadvisor.com.sg/MemberOverlay?Mode=owa&' +
            memberInfoUrlEncode +
            Member_Overlay_Url_Query_Key(touristFunction_Type))
        print(memberInfoUrl)

    # get rating: taken from the second class of the rating span, the
    # first character after removing the 'bubble_' prefix
    rating = 0
    ratingInfo = commentInfoDiv.find('div',
                                     {'class': 'rating reviewItemInline'})
    ratingIcon = ratingInfo.find('span').get('class')
    if len(ratingIcon) == 2:
        rating = int(ratingIcon[1].replace('bubble_', '')[:1])
    # get rating date
    reviewDate = ratingInfo.find('span', {
        'class': 'ratingDate relativeDate'
    }).get('title')

    quoteDiv = commentInfoDiv.find('div', {'class': 'quote'})
    quoteSpan = quoteDiv.find('a').find('span', {'class': 'noQuotes'})
    quoteTitle = Translate_Text(quoteSpan.text)[Utils.Google_TranslatedKey]

    # get review comment
    commentText = ''
    commentTextContainer = commentInfoDiv.find(
        'div', {'class': 'prw_rup prw_reviews_text_summary_hsx'})
    if commentTextContainer is not None:
        commentTextField = commentTextContainer.find(
            'p', {'class': 'partial_entry'})
        commentText = commentTextField.text
        moreLink = commentTextField.find('span',
                                         {'class': 'taLnk ulBlueLinks'})
        if moreLink is not None and moreLink.text == 'More':
            # the comment is still truncated; print it for inspection
            print(commentText)
    translationC = Translate_Text(commentText.replace('<br>', '\r\n'))
    commentText = translationC[Utils.Google_TranslatedKey]
    langUsed = translationC[Utils.Google_SourceLangKey]
    sentiment = TextAUtils.Get_Sentiment_Result(commentText)
    keyPhrases = TextAUtils.Get_KeyPhrases_From_Comment(commentText)

    # get number of thumbsup
    helpfulContentDiv = commentInfoDiv.find(
        'div', {'class': 'helpful redesigned hsx_helpful'})
    numHelpSpan = helpfulContentDiv.find(
        'span', {'class': 'numHelp emphasizeWithColor'})
    numHelpText = numHelpSpan.text.strip()
    numHelp = int(numHelpText) if numHelpText != '' else 0

    # Fresh defaults for every review, so nothing from a previously
    # processed reviewer can leak into this user document.
    userDocument = {
        Utils.DicUserNameKey: '',
        Utils.DicBadgeLevelKey: '',
        Utils.DicTagsKey: [],
        Utils.DicHomeTownKey: '',
        Utils.DicAgeSinceKey: '',
        Utils.DicAgeRangeKey: '',
        Utils.DicGenderKey: '',
        Utils.DicUserContributionKey: 0,
        Utils.dicCitiesVisitedKey: 0,
        Utils.DicHelpfulVotesKey: 0,
        Utils.DicPhotosKey: 0,
        Utils.DicForumPostKey: 0,
        Utils.DicRatingGivenKey: 0,
        Utils.DicExcellentNumKey: 0,
        Utils.DicVeryGoodNumKey: 0,
        Utils.DicAverageNumKey: 0,
        Utils.DicPoorNumKey: 0,
        Utils.DicTerribleNumKey: 0
    }
    if memberInfoUrl != '':
        userLink = _Fill_User_From_Member_Overlay(userDocument, memberInfoUrl)
        _Fill_User_From_Profile_Page(userDocument, userLink)
    else:
        # No member overlay link: there is no overlay or profile page to
        # read, so the user keeps the defaults and the same empty tag
        # marker that is used when a profile has no tag block.
        userDocument[Utils.DicTagsKey].append('')
    if Is_NoneOrEmpty_String(userDocument[Utils.DicUserNameKey]):
        # anonymous reviewer: generate a unique placeholder name
        userDocument[Utils.DicUserNameKey] = str(uuid.uuid1())

    reviewDocument = {
        Utils.DicUserNameKey: userDocument[Utils.DicUserNameKey],
        Utils.DicTouristFunctionKey: touristFunctionName,
        Utils.DicReviewDateKey: DateUtils.parserDateString(reviewDate),
        Utils.DicTitleKey: quoteTitle,
        Utils.DicCoomentKey: commentText,
        Utils.DicCommentLangKey: langUsed,
        Utils.DicSentimentKey: sentiment,
        Utils.DicKeyPhrasesKey: keyPhrases,
        Utils.DicReviewRatingKey: rating,
        Utils.DicNumerOfHelpKey: numHelp
    }
    return reviewDocument, userDocument


def Scraping_Procedure(processUrl, reviewDic, userDic, touristFunctionName,
                       touristFunction_Type, lock, dbClient):
    """Scrape every review on each page URL and store the results.

    Each URL is rendered with PhantomJS. Every 'review-container' element
    on the page is turned into a review document and a user document; both
    are inserted into MongoDB while holding ``lock`` and are then appended
    to ``reviewDic`` and ``userDic``. An exception raised while handling a
    page is logged and the remaining reviews of that page are skipped; a
    failing database insert is logged and the documents are still appended.

    Args:
        processUrl: iterable of review page URLs.
        reviewDic: list that receives the review documents.
        userDic: list that receives the user documents.
        touristFunctionName: name stored in every review document.
        touristFunction_Type: type identifier forwarded to the page helpers.
        lock: context manager held around the database inserts.
        dbClient: forwarded to ``MongoHelper.Insert_Data_IntoMongoDB``.

    Returns:
        The tuple ``(reviewDic, userDic)`` (the objects passed in).
    """
    for pUrl in processUrl:
        try:
            print(pUrl)
            pSoup = _Render_Review_Page(pUrl, touristFunction_Type)
            pContain = pSoup.find(
                'div', {
                    'id': Topic_Location_Two_Column_ID(touristFunction_Type)
                }).find('div', {
                    'id': Review_Container_ID(touristFunction_Type)
                }).findAll('div', {'class': 'review-container'})
            for con in pContain:
                reviewDocument, userDocument = _Scrape_Review_Container(
                    con, pUrl, touristFunctionName, touristFunction_Type)
                with lock:
                    try:
                        MongoHelper.Insert_Data_IntoMongoDB(
                            Utils.TripAdvisor_DB, Utils.Reviews_Table,
                            dbClient, reviewDocument)
                        MongoHelper.Insert_Data_IntoMongoDB(
                            Utils.TripAdvisor_DB, Utils.Users_Table, dbClient,
                            userDocument, Utils.DicUserNameKey,
                            userDocument[Utils.DicUserNameKey])
                    except Exception as e:
                        logging.exception('error in insert into database : ' +
                                          str(e))
                reviewDic.append(reviewDocument)
                userDic.append(userDocument)
        except Exception as e:
            logging.exception('Error' + str(e) + ' in : ' + pUrl)
    return reviewDic, userDic
def Show_Avg_Review_Rating_DateRange(renovateStart,
                                     renovationEnd,
                                     functionName,
                                     keyword='',
                                     daysBefore=365,
                                     daysAfter=365):
    """Plot the average review rating before, during and after a renovation.

    Three date ranges are queried through
    ``MongoUtils.averageReviewRating_DateRange``:

    1. the window ending the day before the renovation starts,
    2. the renovation itself,
    3. the window starting the day after the renovation ends (capped at the
       latest date returned by ``MongoUtils.getLatestDate``).

    The first ``avgRating`` value of each result is drawn as a point with a
    dashed guide line, and the figure is shown with ``plt.show()``.

    Args:
        renovateStart: Renovation start, in whatever form
            ``DateUtils.parserDate`` accepts.
        renovationEnd: Renovation end, same form as ``renovateStart``.
        functionName: Name forwarded to the Mongo queries and used as the
            plot title.
        keyword: Optional keyword forwarded to the Mongo queries; when
            ``Get_Append_Keyword_Title`` yields a non-empty string a
            "Rating Mean" subtitle is appended to the title.
        daysBefore: Size argument passed to ``Get_Past_Date_Object`` for the
            "before" window (presumably a number of days - confirm against
            that helper).
        daysAfter: Same, for the "after" window (passed negated).

    Returns:
        None. Prints ``'no record found'`` and returns without plotting when
        any of the three queries yields no rows.
    """
    renovateStart_Date = DateUtils.parserDate(renovateStart)
    oneDayBefore = One_Day_Before(renovateStart_Date)
    oneYearBefore = Get_Past_Date_Object(oneDayBefore, daysBefore)
    renovateEnd_Date = DateUtils.parserDate(renovationEnd)
    oneDayAfter = One_Day_After(renovateEnd_Date)
    oneYearAfter = Get_Past_Date_Object(oneDayAfter, -daysAfter)

    # Never look past the most recent record available for this function.
    latestRecord = MongoUtils.getLatestDate(collection, functionName)
    if oneYearAfter > latestRecord:
        oneYearAfter = latestRecord

    # (start, end) of the before / undergoing / after periods, in plot order.
    periods = [
        (oneYearBefore, oneDayBefore),
        (renovateStart_Date, renovateEnd_Date),
        (oneDayAfter, oneYearAfter),
    ]
    frames = []
    for periodStart, periodEnd in periods:
        cursor = MongoUtils.averageReviewRating_DateRange(
            collection, periodStart, periodEnd, functionName, keyword)
        frames.append(pd.DataFrame(list(cursor)))

    # An empty frame has no 'avgRating' column; indexing it would raise a
    # KeyError. Report it the same way the sentiment chart does instead.
    if any(frame.empty for frame in frames):
        print('no record found')
        return

    timeline = [1, 2, 3]
    averages = [
        Utils.FormatFloatWithTwoDecimalPlace(frame['avgRating'][0])
        for frame in frames
    ]

    plt.figure(1)
    plt.subplot(211)
    plt.xticks(timeline, [
        Get_Display_DateRange(oneYearBefore, oneDayBefore),
        Get_Display_DateRange(renovateStart, renovationEnd),
        Get_Display_DateRange(oneDayAfter, oneYearAfter)
    ])

    # With the x-axis spanning 0.5..3.5 below, the points at x = 1, 2, 3 sit
    # at axes fractions 1/6, 1/2 and 5/6, so each guide line stops at its
    # own point.
    guideLines = [
        (1 / 6, Utils.Orange_Color),
        (0.5, Utils.Light_Green_Color),
        (5 / 6, Utils.Blue_Color),
    ]
    for average, (xmax, color) in zip(averages, guideLines):
        plt.axhline(y=average,
                    xmin=0,
                    xmax=xmax,
                    c=color,
                    linestyle='dashed')

    plt.ylabel('Rating Mean')
    plt.plot(timeline, averages, 'bo')
    kw = Get_Append_Keyword_Title(keyword)
    plt.title(functionName + ('\n Rating Mean' + kw if kw != '' else ''),
              loc='center')
    plt.axis([0.5, 3.5, min(averages) - 0.5, max(averages) + 0.5])
    plt.show()
def Show_Sentiment_Stacked_Bar_DateRange(renovateStart,
                                         renovationEnd,
                                         functionName,
                                         keyword='',
                                         daysBefore=365,
                                         daysAfter=365):
    """Plot a 100%-stacked sentiment bar chart around a renovation.

    Three date ranges are queried through ``MongoUtils.aggregateSentiment``:
    the window before the renovation, the renovation itself, and the window
    after it (capped at the latest date returned by
    ``MongoUtils.getLatestDate``). Each result row is read as a sentiment
    label in ``'_id'`` and a count in ``'total'``. One bar per period shows
    the negative / neutral / positive share in percent, and the negative
    share is also written as text on each bar.

    Args:
        renovateStart: Renovation start, in whatever form
            ``DateUtils.parserDate`` accepts.
        renovationEnd: Renovation end, same form as ``renovateStart``.
        functionName: Name forwarded to the Mongo queries and used as the
            plot title.
        keyword: Optional keyword forwarded to the Mongo queries and, via
            ``Get_Append_Keyword_Title``, appended to the title.
        daysBefore: Size argument passed to ``Get_Past_Date_Object`` for the
            "before" window (presumably a number of days - confirm against
            that helper).
        daysAfter: Same, for the "after" window (passed negated).

    Returns:
        None. Prints ``'no record found'`` and returns without plotting when
        any of the three queries yields no rows.
    """
    renovateStart_Date = DateUtils.parserDate(renovateStart)
    # Pass the parsed date, matching Show_Avg_Review_Rating_DateRange and
    # the One_Day_After call below.
    oneDayBefore = One_Day_Before(renovateStart_Date)
    oneYearBefore = Get_Past_Date_Object(oneDayBefore, daysBefore)
    renovateEnd_Date = DateUtils.parserDate(renovationEnd)
    oneDayAfter = One_Day_After(renovateEnd_Date)
    oneYearAfter = Get_Past_Date_Object(oneDayAfter, -daysAfter)

    # Never look past the most recent record available for this function.
    latestRecord = MongoUtils.getLatestDate(collection, functionName)
    if oneYearAfter > latestRecord:
        oneYearAfter = latestRecord

    # (start, end) of the before / undergoing / after periods, in plot order.
    periods = [
        (oneYearBefore, oneDayBefore),
        (renovateStart_Date, renovateEnd_Date),
        (oneDayAfter, oneYearAfter),
    ]
    frames = []
    for periodStart, periodEnd in periods:
        cursor = MongoUtils.aggregateSentiment(collection, periodStart,
                                               periodEnd, functionName,
                                               keyword)
        frames.append(pd.DataFrame(list(cursor)))

    # Check for emptiness before touching any column: an empty frame has no
    # '_id' column, so sorting or indexing it would raise a KeyError.
    if any(frame.empty for frame in frames):
        print('no record found')
        return

    # Stacking order of the sentiment labels, bottom to top.
    name_Value = [
        Utils.Negative_Col.lower(),
        Utils.Neutral_Col.lower(),
        Utils.Positive_Col.lower()
    ]

    # counts[period][sentiment]; a sentiment missing from a period stays 0.
    counts = []
    for frame in frames:
        periodCounts = [0, 0, 0]
        for _, row in frame.iterrows():
            # list.index raises ValueError for unknown labels, so test
            # membership first and ignore labels we cannot place.
            if row['_id'] in name_Value:
                periodCounts[name_Value.index(row['_id'])] = row['total']
        counts.append(periodCounts)

    timeline_labels = [
        Get_Display_DateRange(oneYearBefore, oneDayBefore),
        Get_Display_DateRange(renovateStart, renovationEnd),
        Get_Display_DateRange(oneDayAfter, oneYearAfter)
    ]
    negative_counts = [periodCounts[0] for periodCounts in counts]
    neutral_counts = [periodCounts[1] for periodCounts in counts]
    positive_counts = [periodCounts[2] for periodCounts in counts]

    # Total number of classified reviews per period.
    totals = [sum(periodCounts) for periodCounts in counts]

    def to_share(values):
        # Percentage of each period's total; 0 when the period has no
        # classified reviews, to avoid dividing by zero.
        return [
            value / total * 100 if total else 0
            for value, total in zip(values, totals)
        ]

    negative_R = to_share(negative_counts)
    neutral_R = to_share(neutral_counts)
    positive_R = to_share(positive_counts)

    # Create a figure with a single subplot
    f, ax = plt.subplots(1, figsize=(6, 3))

    bar_width = 1
    # x positions of the bars, one per period
    bar_l = list(range(len(counts)))
    # positions of the x-axis ticks
    tick_pos = [i + (bar_width / 2) for i in bar_l]

    # Negative share at the bottom of each bar.
    p1 = ax.bar(bar_l,
                negative_R,
                label=Utils.Negative_Col,
                alpha=0.9,
                color=Utils.Orange_Color,
                width=bar_width,
                edgecolor='white')

    # Neutral share stacked on top of the negative share.
    p2 = ax.bar(bar_l,
                neutral_R,
                bottom=negative_R,
                label=Utils.Neutral_Col,
                alpha=0.9,
                color=Utils.Light_Green_Color,
                width=bar_width,
                edgecolor='white')

    # Positive share stacked on top of negative + neutral.
    p3 = ax.bar(bar_l,
                positive_R,
                bottom=[i + j for i, j in zip(negative_R, neutral_R)],
                label=Utils.Positive_Col,
                alpha=0.9,
                color=Utils.Blue_Color,
                width=bar_width,
                edgecolor='white')

    # Label the ticks with the date range of each period.
    plt.xticks(tick_pos, timeline_labels)

    def to_percent(y, position):
        # Ignore the passed in position. This has the effect of scaling the
        # default tick locations.
        s = str(y)

        # The percent symbol needs escaping in latex
        if matplotlib.rcParams['text.usetex'] is True:
            return s + r'$\%$'
        else:
            return s + '%'

    plt.gca().yaxis.set_major_formatter(FuncFormatter(to_percent))

    # NOTE(review): this writes each raw negative count at data coordinates
    # (count, period index), which looks like a leftover from a horizontal
    # bar chart and may land outside the visible axes. Kept as-is; confirm
    # the intended placement.
    for i, v in enumerate(negative_counts):
        ax.text(v, i, str(v), color='blue', fontweight='bold')

    kw = Get_Append_Keyword_Title(keyword)
    plt.title(functionName + ('\nSentiment Comparison' +
                              kw if kw != '' else ' Sentiment Comparison'),
              loc='center')

    # Leave some margin around the bars.
    plt.xlim(
        [min(tick_pos) - bar_width * 1.1,
         max(tick_pos) + bar_width * 0.4])
    plt.ylim(-5, 105)

    # Write the negative percentage half-way up each negative segment. The
    # label is the third positional argument of plt.text (its ``s``
    # parameter).
    for x_pos, share in zip((-0.1, 0.9, 2), negative_R):
        plt.text(x_pos,
                 share / 2,
                 Utils.Format_To_Percentage_With_OneFloatPoint(share),
                 ha='left')

    # Keep the tick labels horizontal, right-aligned to their tick.
    plt.setp(plt.gca().get_xticklabels(),
             rotation=0,
             horizontalalignment='right')

    plt.legend((p3[0], p2[0], p1[0]),
               (Utils.Positive_Col, Utils.Neutral_Col, Utils.Negative_Col),
               loc='center right')

    plt.show()