def Main(site, db_name, runDate, targetDate):
    """Crawl one target date, upload the result to MongoDB and report to Slack.

    Steps, in order: connect to the AWS MongoDB instance, announce the start
    on Slack, run ``Main_Naver(targetDate)``, insert the returned news and
    comment records into the ``newsNaver2018`` and ``comments2018``
    collections, then post a summary (counts, dates, timings) to Slack.

    Args:
        site: Label used only in the Slack messages.
        db_name: Name of the database handed to ``dh.Use_Database``.
        runDate: Object with ``strftime``; formatted as ``%Y%m%d`` in the
            Slack report.
        targetDate: Passed through to ``Main_Naver`` and echoed in the
            Slack messages.

    Returns:
        None.
    """
    mongodb = dh.ToMongoDB(*dh.AWS_MongoDB_Information())
    try:
        useDb = dh.Use_Database(mongodb, db_name)
        slack = cb.Slacker(cb.slacktoken())
        slack.chat.post_message(
            '# general',
            'Start : {}, targetData : {} '.format(site, targetDate))

        startTime = datetime.now()
        # Both results must support len() and .to_dict('records')
        # (presumably pandas DataFrames -- confirm against Main_Naver).
        newsDf, commentsDf = Main_Naver(targetDate)
        newsCollectionName = 'newsNaver2018'
        # Keep middleTime as a timestamp: it is the start of the upload phase,
        # so it must not be overwritten with the crawl duration.
        middleTime = datetime.now()
        runningTime = middleTime - startTime

        print('Start Uploading')
        newsCollection = dh.Use_Collection(useDb, newsCollectionName)
        newsRecords = newsDf.to_dict('records')
        # insert_many rejects an empty document list, so skip empty results.
        if newsRecords:
            newsCollection.insert_many(newsRecords)
        commentCollection = dh.Use_Collection(useDb, 'comments2018')
        commentRecords = commentsDf.to_dict('records')
        if commentRecords:
            commentCollection.insert_many(commentRecords)
        print('End Uploading')

        endTime = datetime.now()
        uploadTime = endTime - middleTime

        outcome_info = '{}, news : {}, comment : {}'.format(
            site, len(newsDf), len(commentsDf))
        date_info = 'run date : {}, target date : {}'.format(
            runDate.strftime('%Y%m%d'), targetDate)
        # The upload duration now has its own placeholder.
        time_info = 'running time : {}, uploading time : {}'.format(
            runningTime, uploadTime)
        slack.chat.post_message('# general', outcome_info)
        slack.chat.post_message('# general', date_info)
        slack.chat.post_message('# general', time_info)
        slack.chat.post_message('# general', 'Complete Upload In AWS Mongodb')
    finally:
        # Release the connection even when crawling or uploading fails.
        mongodb.close()
def Read_Comments2(row):
    """Load the stored comments matching one news row into a DataFrame.

    Queries the ``comments`` collection of the ``hy_db`` database on the GCP
    MongoDB instance for documents whose ``site``, ``category``, ``date`` and
    ``rank`` equal the values in ``row``. ``rank`` is cast to ``int`` before
    querying. The shape of the resulting frame is printed.

    Args:
        row: Mapping-like object providing ``'site'``, ``'category'``,
            ``'date'`` and ``'rank'`` (presumably a pandas Series -- confirm
            against the caller).

    Returns:
        pandas.DataFrame built from the matching documents (empty when
        nothing matches).
    """
    import pandas as pd
    import Database_Handler as dh

    mongodb = dh.ToMongoDB(*dh.GCP_MongoDB_Information())
    try:
        useDB = dh.Use_Database(mongodb, 'hy_db')
        commentCollection = dh.Use_Collection(useDB, 'comments')
        info = {
            'site': row['site'],
            'category': row['category'],
            'date': row['date'],
            'rank': int(row['rank']),
        }
        # Drain the cursor while the connection is still open.
        commentsForNews = pd.DataFrame(list(commentCollection.find(info)))
    finally:
        # One connection is opened per call; close it so repeated calls
        # do not accumulate open connections.
        mongodb.close()

    print(commentsForNews.shape)
    return commentsForNews
def GetNumberOfCommentInDB(row):
    """Reconcile a news document's comment count with the comments stored.

    Counts the documents in the ``comments`` collection of ``hy_db`` (GCP
    MongoDB instance) whose ``site``, ``category``, ``date`` and ``rank``
    match ``row``. When that count differs from
    ``row['number_of_crawled_comment']``, the news document with
    ``_id == ObjectId(row['id'])`` gets ``real_number_of_comment`` set to the
    stored count. The news collection is ``newsDaum`` when ``row['site']`` is
    ``'daum'`` and ``newsNaver`` otherwise. Every 100th row (by ``row.name``)
    its name is printed as a progress marker.

    Args:
        row: Object providing the keys ``'site'``, ``'category'``, ``'date'``,
            ``'rank'``, ``'id'`` and ``'number_of_crawled_comment'`` plus an
            integer ``name`` attribute (presumably a pandas Series from a
            row-wise ``apply`` -- confirm against the caller).

    Returns:
        None.
    """
    import Database_Handler as dh
    from bson import ObjectId

    mongodb = dh.ToMongoDB(*dh.GCP_MongoDB_Information())
    try:
        useDB = dh.Use_Database(mongodb, 'hy_db')
        commentCollection = dh.Use_Collection(useDB, 'comments')
        # NOTE(review): Read_Comments2 casts rank with int() before querying;
        # this function queries with the raw value -- confirm which is intended.
        info = {
            'site': row['site'],
            'category': row['category'],
            'date': row['date'],
            'rank': row['rank'],
        }
        # Cursor.count() is deprecated and absent from PyMongo 4; prefer
        # count_documents and fall back only when the driver lacks it.
        if hasattr(commentCollection, 'count_documents'):
            realNumCount = commentCollection.count_documents(info)
        else:
            realNumCount = commentCollection.find(info).count()

        oid = ObjectId(row['id'])
        newsCollectionName = 'newsDaum' if row['site'] == 'daum' else 'newsNaver'
        newsCollection = dh.Use_Collection(useDB, newsCollectionName)
        if realNumCount != row['number_of_crawled_comment']:
            newsCollection.update_one(
                {'_id': oid},
                {'$set': {'real_number_of_comment': realNumCount}})
    finally:
        # One connection is opened per call; close it so repeated calls
        # do not accumulate open connections.
        mongodb.close()

    if row.name % 100 == 0:
        print(row.name)
if isElementPresent(driver, 'tag_relate') == False: keywords = 'NaN' else: element = WebDriverWait(driver, 3).until( EC.presence_of_element_located((By.CLASS_NAME, 'tag_relate'))) keywords = driver.find_elements_by_class_name('tag_relate') keywords = list(map(lambda x: x.text, keywords)) keywords = list(map(lambda x: re.sub('#', '', x), keywords)) driver.quit() return keywords if __name__ == '__main__': site = 'Naver' collection = 'newsNaver' mongodb = dh.ToMongoDB(*dh.AWS_MongoDB_Information()) dbname = 'hy_db' useDb = dh.Use_Database(mongodb, dbname) slack = cb.Slacker(cb.slacktoken()) useCollection = dh.Use_Collection(useDb, collection) dataList = useCollection.find({'site': site}) for data in dataList: if not 'keywords' in data.keys(): keywords = SearchKeywordsFromDaumForNaver2(data['title']) useCollection.update({"_id": data['_id']}, {'$set': { "keywords": keywords }}) print(keywords) elif 'keywords' in data.keys() and data['keywords'] == 'NaN':