import html
import time

from bs4 import BeautifulSoup
from tqdm import tqdm

# Browser, makeDir, writeToFile, downloadImage, and the extract* helpers are
# project-local utilities assumed to be defined (or imported) elsewhere in
# this module.


def runCrawl(limitNum=0, queryList=None, is_all_comments=False):
    # Avoid a mutable default argument: a shared list would leak state
    # between calls.
    if queryList is None:
        queryList = []
    browser = Browser("driver/chromedriver")
    for query in queryList:
        browser.clearLink()
        makeDir("data")
        makeDir("data/" + query)
        # Queries starting with "#" are hashtag pages; anything else is
        # treated as a username.
        if query[0] == "#":
            mUrl = "https://www.instagram.com/explore/tags/" + query[1:] + "/?hl=en"
        else:
            mUrl = "https://www.instagram.com/" + query + "/?hl=en"
        browser.goToPage(mUrl)

        print("collecting url of " + query + "...")
        browser.scrollPageToBottomUntilEnd(browser.collectDpageUrl, limitNum)
        print("finish scroll collecting!")

        print("collecting data...")
        slist = list(set(browser.urlList))  # deduplicate collected post URLs
        for url in tqdm(slist):
            # Post URLs look like https://www.instagram.com/p/<shortcode>/,
            # so index 4 of the split is the post shortcode.
            dirName = url.split("/")[4]
            # skip if already crawled
            if not makeDir("data/" + query + "/" + dirName):
                continue
            browser.goToPage(url)
            if is_all_comments:
                browser.expandComments()
            cur = browser.getPageSource()
            writeToFile("data/" + query + "/" + dirName + "/raw.html", [cur])
            infoData = cur.split("<meta content=")[1].split(" ")

            # extract data
            lang = extractLang(cur)
            # likes = extractLikes(infoData, lang)
            likes = extractLikes_cur(cur)
            comments = extractComments(infoData, lang)
            caption = extractCaption(cur)
            dateTime = extractDateTime(cur)
            commentMessages = extractCommentsMessage(cur)
            # print("likes:", likes, " comments:", comments, " caption:", caption,
            #       "commentMessages:", commentMessages, "dateTime:", dateTime)
            writeToFile("data/" + query + "/" + dirName + "/info.txt", [
                "likes: ", likes, "",
                "comments: ", comments, "",
                "caption: ", caption, "",
                "commentMessages: ", commentMessages, "",
                "dateTime: ", dateTime, "",
            ])

            # download image via the og:image meta tag
            imageUrl = html.unescape(
                cur.split('meta property="og:image" content="')[1].split('"')[0])
            downloadImage(imageUrl, "data/" + query + "/" + dirName + "/image.jpg")
            time.sleep(1)

        print("query " + query + " collecting finish")
        time.sleep(2)

    browser.driver.quit()
    print("FINISH!")
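# Usage sketch (illustrative, not part of the original source): one way to
# drive the anonymous variant above. The query strings and limit are
# hypothetical; "#cat" is crawled as a hashtag page and "natgeo" as a
# profile page. Assumes chromedriver sits at driver/chromedriver, as the
# function expects.
def example_anonymous_crawl():
    runCrawl(limitNum=100, queryList=["#cat", "natgeo"], is_all_comments=True)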
# Variant with optional login support: same crawl loop, but parses the page
# with BeautifulSoup instead of raw string splitting.
def runCrawl(limitNum=0, queryList=None, is_all_comments=False, userinfo=None):
    # Avoid mutable default arguments; None stands in for "not provided".
    if queryList is None:
        queryList = []
    browser = Browser("driver/chromedriver")

    # Log in first when credentials are supplied; otherwise crawl anonymously.
    if userinfo:
        print("Start logging in")
        browser.goToPage("https://www.instagram.com/accounts/login/?hl=en")
        if browser.log_in(userinfo):
            print("Logged in successfully")
        else:
            print("Failed to log in")
            return
    else:
        print("Continuing without logging in")

    for query in queryList:
        browser.clearLink()
        makeDir("data")
        makeDir("data/" + query)
        # Queries starting with "#" are hashtag pages; anything else is
        # treated as a username.
        if query[0] == "#":
            mUrl = "https://www.instagram.com/explore/tags/" + query[1:] + "/?hl=en"
        else:
            mUrl = "https://www.instagram.com/" + query + "/?hl=en"
        browser.goToPage(mUrl)

        print("collecting url of " + query + "...")
        browser.scrollPageToBottomUntilEnd(browser.collectDpageUrl, limitNum)
        print("finish scroll collecting!")

        print("collecting data...")
        slist = list(set(browser.urlList))  # deduplicate collected post URLs
        for url in tqdm(slist):
            dirName = url.split("/")[4]
            # skip if already crawled
            if not makeDir("data/" + query + "/" + dirName):
                continue
            browser.goToPage(url)
            if is_all_comments:
                browser.expandComments()
            cur = browser.getPageSource()
            writeToFile("data/" + query + "/" + dirName + "/raw.html", [cur])
            infoData = BeautifulSoup(cur, "lxml")
            # "FFVAD" is Instagram's obfuscated (and change-prone) CSS class
            # for the post image element.
            imageData = infoData.find("img", class_="FFVAD")

            # extract data
            likes = extractLikes(infoData)
            comments_list = extractComments(infoData)
            comments = len(comments_list)
            caption = extractCaption(imageData)
            dateTime = extractDateTime(infoData)
            commentMessages = extractCommentsMessage(comments_list)
            # print("likes:", likes, " comments:", comments, " caption:", caption,
            #       "commentMessages:", commentMessages, "dateTime:", dateTime)
            writeToFile("data/" + query + "/" + dirName + "/info.txt", [
                "likes: ", likes, "",
                "comments: ", comments, "",
                "caption: ", caption, "",
                "commentMessages: ", commentMessages, "",
                "dateTime: ", dateTime, "",
            ])

            # download image; srcset lists candidate URLs at several widths,
            # which downloadImage is assumed to handle
            imageUrl = imageData.get("srcset")
            downloadImage(imageUrl, "data/" + query + "/" + dirName + "/image.jpg")
            time.sleep(1)

        print("query " + query + " collecting finish")
        time.sleep(2)

    browser.driver.quit()
    print("FINISH!")
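# Usage sketch (illustrative, not part of the original source): drive the
# login-enabled variant. The credential dict keys are an assumption about
# what Browser.log_in() expects; substitute a real account, or omit userinfo
# to crawl anonymously.
def example_logged_in_crawl():
    credentials = {"username": "your_username", "password": "your_password"}
    runCrawl(limitNum=50, queryList=["#sunset"], is_all_comments=False,
             userinfo=credentials)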