Example #1
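This example crawls each query (a "#hashtag" or a username), scrolls the page to collect post URLs, then scrapes likes, comment count, caption, timestamp, and comment messages from each post's raw HTML via string splitting, and downloads the post image.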
import html
import time

from tqdm import tqdm

# Browser, makeDir, writeToFile, downloadImage and the extract* helpers are
# assumed to be defined elsewhere in this crawler module.
def runCrawl(limitNum=0, queryList=None, is_all_comments=False):
    queryList = queryList or []  # avoid a shared mutable default argument
    browser = Browser("driver/chromedriver")
    for query in queryList:
        browser.clearLink()
        makeDir("data")
        makeDir("data/" + query)
        mUrl = ""
        if query[0] == "#":
            mUrl = "https://www.instagram.com/explore/tags/" + query[
                1:] + "/?hl=en"
        else:
            mUrl = "https://www.instagram.com/" + query + "/?hl=en"
        browser.goToPage(mUrl)
        print("collecting url of " + query + "...")
        browser.scrollPageToBottomUntilEnd(browser.collectDpageUrl, limitNum)
        print("finish scoll collecting!")

        print("collecting data...")
        slist = list(set(browser.urlList))  # de-duplicate collected post URLs
        for url in tqdm(slist):
            dirName = url.split("/")[4]  # post shortcode from .../p/<shortcode>/
            # skip if already crawled
            if not makeDir("data/" + query + "/" + dirName):
                continue
            browser.goToPage(url)
            if is_all_comments:
                browser.expandComments()
            cur = browser.getPageSource()
            writeToFile("data/" + query + "/" + dirName + "/raw.html", [cur])
            # crude string-split parsing of the meta tags in the raw HTML
            infoData = cur.split("<meta content=")[1].split(" ")
            # extract data
            lang = extractLang(cur)
            # likes = extractLikes(infoData, lang)
            likes = extractLikes_cur(cur)
            comments = extractComments(infoData, lang)
            caption = extractCaption(cur)
            dateTime = extractDateTime(cur)
            commentMessages = extractCommentsMessage(cur)
            # print("likes:",likes," comments:", comments," caption:", caption,
            #     "commentMessages:", commentMessages, "dateTime:", dateTime)
            writeToFile("data/" + query + "/" + dirName + "/info.txt", [
                "likes: ", likes, "", "comments: ", comments, "", "caption: ",
                caption, "", "commentMessages: ", commentMessages, "",
                "dateTime: ", dateTime, ""
            ])
            # download image
            imageUrl = html.unescape(
                cur.split('meta property="og:image" content="')[1].split('"')[0])
            downloadImage(imageUrl,
                          "data/" + query + "/" + dirName + "/image.jpg")
            time.sleep(1)
        print("query " + query + " collecting finish")

    time.sleep(2)
    browser.driver.quit()
    print("FINISH!")
Example #2
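Same flow as Example #1, but with an optional login step and with the post pages parsed via BeautifulSoup instead of raw string splitting.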
import time

from bs4 import BeautifulSoup
from tqdm import tqdm

# Browser, makeDir, writeToFile, downloadImage and the extract* helpers are
# assumed to be defined elsewhere in this crawler module.
def runCrawl(limitNum=0, queryList=None, is_all_comments=False, userinfo=None):
    queryList = queryList or []  # avoid shared mutable default arguments
    browser = Browser("driver/chromedriver")
    if userinfo:
        print('Logging in...')
        browser.goToPage('https://www.instagram.com/accounts/login/?hl=en')
        if browser.log_in(userinfo):
            print('Logged in successfully')
        else:
            print('Failed to log in')
            return
    else:
        print('Continuing without logging in')
    for query in queryList:
        browser.clearLink()
        makeDir("data")
        makeDir("data/" + query)
        mUrl = ""
        if query[0] == "#":
            mUrl = "https://www.instagram.com/explore/tags/" + query[
                1:] + "/?hl=en"
        else:
            mUrl = "https://www.instagram.com/" + query + "/?hl=en"
        browser.goToPage(mUrl)
        print("collecting url of " + query + "...")
        browser.scrollPageToBottomUntilEnd(browser.collectDpageUrl, limitNum)
        print("finish scoll collecting!")

        print("collecting data...")
        slist = list(set(browser.urlList))  # de-duplicate collected post URLs
        for url in tqdm(slist):
            dirName = url.split("/")[4]  # post shortcode from .../p/<shortcode>/
            # skip if already crawled
            if not makeDir("data/" + query + "/" + dirName):
                continue
            browser.goToPage(url)
            if is_all_comments:
                browser.expandComments()
            cur = browser.getPageSource()
            writeToFile("data/" + query + "/" + dirName + "/raw.html", [cur])
            infoData = BeautifulSoup(cur, "lxml")
            imageData = infoData.find("img", class_="FFVAD")
            # extract data
            likes = extractLikes(infoData)
            comments_list = extractComments(infoData)
            comments = len(comments_list)
            caption = extractCaption(imageData)
            dateTime = extractDateTime(infoData)
            commentMessages = extractCommentsMessage(comments_list)
            # print("likes:",likes," comments:", comments," caption:", caption,
            #     "commentMessages:", commentMessages, "dateTime:", dateTime)
            writeToFile("data/" + query + "/" + dirName + "/info.txt", [
                "likes: ", likes, "", "comments: ", comments, "", "caption: ",
                caption, "", "commentMessages: ", commentMessages, "",
                "dateTime: ", dateTime, ""
            ])
            # download image
            imageUrl = imageData.get("srcset")
            downloadImage(imageUrl,
                          "data/" + query + "/" + dirName + "/image.jpg")
            time.sleep(1)
        print("query " + query + " collecting finish")

    time.sleep(2)
    browser.driver.quit()
    print("FINISH!")