def scrape_data(user_id, scan_list, section, elements_path, save_status, file_names):
    """Given some parameters, this function can scrap friends/photos/videos/about/posts(statuses) of a profile"""
    page = []

    if save_status == 4:
        page.append(user_id)

    page += [user_id + s for s in section]

    for i, _ in enumerate(scan_list):
        driver.get(page[i])

        if save_status != 3:
            utils.scroll(total_scrolls, driver, selectors, scroll_time, dbid)

        data = bs(driver.page_source, 'lxml').find_all('div', attrs={"class":"du4w35lb k4urcfbm l9j0dhe7 sjgh65i0"})
        if len(data) == 0:
            driver.refresh()
            time.sleep(0.5)
            driver.find_element_by_xpath("//a[contains(text(),'Timeline')]").click()
            time.sleep(0.5)
            driver.find_element_by_xpath("//a[contains(text(),'Timeline')]").click()
            time.sleep(3)
            data = bs(driver.page_source, 'lxml').find_all('div', attrs={"class":"du4w35lb k4urcfbm l9j0dhe7 sjgh65i0"})
        save_to_file(file_names[i], data, save_status, i)
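
# Hypothetical usage sketch for the scrape_data variant above, assuming the
# module-level globals it relies on (driver, selectors, total_scrolls,
# scroll_time, dbid) are already initialized elsewhere; the profile URL,
# section suffixes and output file names below are placeholders.
scan_list = ["Friends", "Photos"]
section = ["/friends", "/photos"]
file_names = ["friends.txt", "photos.txt"]
scrape_data("https://www.facebook.com/some.profile", scan_list, section,
            elements_path=[], save_status=0, file_names=file_names)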
Example #2
def getTrends(driver):
    driver.get("https://www.youtube.com/feed/trending")
    utils.scroll(driver, numScrolls=20)
    soup = BeautifulSoup(driver.page_source, "html.parser")
    vids = []
    for k in soup.find_all('div', id='grid-container'):
        vids += k.find_all('ytd-video-renderer')
    videos = utils.getVideoFromSearch(vids)
    return videos
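
# Hypothetical usage sketch for getTrends, assuming Selenium and the utils
# helpers imported by this module are available; the Firefox driver setup is
# illustrative only, and utils.getVideoFromSearch is assumed to return a list.
from selenium import webdriver

driver = webdriver.Firefox()
trending = getTrends(driver)
print(f"Collected {len(trending)} trending videos")
driver.quit()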
Example #3
def downloadImagesByTag(tag, maxImages, isClosing):
    driver = webdriver.Firefox()
    driver.get("https://www.instagram.com/")

    utils.login(driver, username, password)
    searchByTag(driver, tag, 1)
    utils.scroll(driver, 1)
    #target all the link elements on the page
    numberOfImagesDownloaded = download.downloadImages(driver, maxImages, tag)
    print(f'Downloaded {numberOfImagesDownloaded} images')
    if isClosing:
        utils.end(driver, 1)
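
# Hypothetical usage sketch: download up to 25 images tagged #nature and close
# the browser afterwards. The tag and limit are placeholders; the function
# expects module-level username/password credentials to be defined already.
downloadImagesByTag("nature", maxImages=25, isClosing=True)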
Example #4
def scrape_data(user_id, scan_list, section, elements_path, save_status,
                file_names):
    """Given some parameters, this function can scrap friends/photos/videos/about/posts(statuses) of a profile"""
    page = []

    if save_status == 4:
        page.append(user_id)

    page += [user_id + s for s in section]

    for i, _ in enumerate(scan_list):
        try:
            driver.get(page[i])

            # Only run this for friends, photos and videos
            if save_status in (0, 1, 2):

                # the bar which contains all the sections
                sections_bar = driver.find_element_by_xpath(
                    selectors.get("sections_bar"))

                if sections_bar.text.find(scan_list[i]) == -1:
                    continue

            if save_status != 3:
                utils.scroll(total_scrolls, driver, selectors, scroll_time,
                             dbid)

            data = bs(driver.page_source, 'lxml').find_all(
                'div', attrs={"class": "du4w35lb k4urcfbm l9j0dhe7 sjgh65i0"})
            if len(data) == 0:
                driver.refresh()
                time.sleep(0.5)
                driver.find_element_by_xpath(
                    "//a[contains(text(),'Timeline')]").click()
                time.sleep(0.5)
                driver.find_element_by_xpath(
                    "//a[contains(text(),'Timeline')]").click()
                time.sleep(3)
                data = bs(driver.page_source, 'lxml').find_all(
                    'div',
                    attrs={"class": "du4w35lb k4urcfbm l9j0dhe7 sjgh65i0"})
            save_to_file(file_names[i], data, save_status, i)

        except Exception:
            print(
                "Exception (scrape_data)",
                str(i),
                "Status =",
                str(save_status),
                sys.exc_info()[0],
            )
Example #5
def scrape_data(url, scan_list, section, elements_path, save_status,
                file_names):
    """Given some parameters, this function can scrap friends/photos/videos/about/posts(statuses) of a profile"""
    page = []

    if save_status == 4 or save_status == 5:
        page.append(url)

    page += [url + s for s in section]

    for i, _ in enumerate(scan_list):
        try:
            driver.get(page[i])

            # Only run this for friends, photos and videos
            if save_status in (0, 1, 2):

                # the bar which contains all the sections
                sections_bar = driver.find_element_by_xpath(
                    selectors.get("sections_bar"))

                if sections_bar.text.find(scan_list[i]) == -1:
                    continue

            if save_status != 3:
                utils.scroll(total_scrolls, driver, selectors, scroll_time)

            data = driver.find_elements_by_xpath(elements_path[i])

            save_to_file(file_names[i], data, save_status, i)

        except Exception:
            print(
                "Exception (scrape_data)",
                str(i),
                "Status =",
                str(save_status),
                sys.exc_info()[0],
            )
Example #6
def scrape_data(url, scan_list, section, elements_path, save_status,
                file_names):
    """Given some parameters, this function can scrap friends/photos/videos/about/posts(statuses) of a profile"""
    page = []
    pos = -1
    if url[-1] == '/':
        pos = -2
    user_name = url.split("/")[pos]

    print("user_name: ", user_name)

    if save_status == 4 or save_status == 5:
        page.append(url)

    page += [url + s for s in section]

    for i, _ in enumerate(scan_list):

        try:
            driver.get(page[i])

            if save_status != 3:
                utils.scroll(total_scrolls, driver, selectors, scroll_time)

            data = driver.find_elements_by_xpath(elements_path[i])

            save_to_file(file_names[i], data, save_status, i, user_name)

        except Exception:
            print(
                "Exception (scrape_data)",
                str(i),
                "Status =",
                str(save_status),
                sys.exc_info()[0],
            )
Example #7
def get_all_page_data(url, is_community=False):

    name = url.split("/")[-1] if len(
        url.split("/")[-1]) > 0 else url.split("/")[-2]

    if is_community:
        name = os.path.join(name, "community")
        url = url + "/community"

    data_path = os.path.join(".", "data")
    if not os.path.exists(data_path):
        os.mkdir(data_path)

    page_data_path = os.path.join(data_path, name)
    if not os.path.exists(page_data_path):
        os.mkdir(page_data_path)

    should_scrape_headless = not is_community
    driver = initialize_driver(args.chrome,
                               args.windows,
                               is_headless=should_scrape_headless)

    driver.get(url)

    page_name = get_text(driver, './/a[@class="_64-f"]')

    print(f"Scrolling {url} until {cutoff_date}")

    scroll(driver, pd.to_datetime(cutoff_date))

    posts = driver.find_elements_by_xpath(
        '//div[contains(@class, "userContentWrapper")]')

    post_links = [get_post_links(post) for post in tqdm(posts)]

    post_links = list(set(post_links))

    with open(os.path.join(page_data_path, 'post_links.json'), 'w') as f:
        json.dump(post_links, f)

    driver.quit()

    print(f"Now scraping {len(post_links)} posts from {name}")

    for i, post_link in enumerate(post_links):

        if not is_string_url(post_link):
            continue

        print(f"Scraping {post_link}")

        driver = initialize_driver(args.chrome, args.windows)

        driver.get(post_link)

        if "/videos/" in post_link:
            post_type = "videos"
        elif "/photos/" in post_link:
            post_type = "photos"
        elif "/posts/" in post_link:
            post_type = "posts"
        elif "/notes/" in post_link:
            post_type = "notes"
        else:
            post_type = "other"

        if post_type == "notes":
            post_element = driver.find_element_by_xpath(
                './/div[contains(@class, "fb_content")]')
        else:
            post_element = driver.find_element_by_xpath(
                './/div[contains(@class, "userContentWrapper")]')

        post_data = get_post_data(driver, post_element, post_type)

        post_data["page_name"] = page_name

        with open(os.path.join(page_data_path, f'page_post_{i}.json'),
                  'w') as f:
            json.dump(post_data, f)

        driver.quit()

    if not is_community:
        get_all_page_data(url, is_community=True)
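
# Hypothetical usage sketch for get_all_page_data, assuming the module-level
# args (with .chrome/.windows flags) and cutoff_date are configured elsewhere;
# the page URL is a placeholder. The function recurses into the /community tab
# on its own.
get_all_page_data("https://www.facebook.com/examplepage")
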
def start_crawler(url, email, password, depth):

    # Check if the page has loaded
    def page_loaded(driver):
        return driver.find_element_by_tag_name("body") is not None

    # Log in to Twitter - define or redefine the driver
    def log_in():
        # Opening the web browser and the twitter page
        driver = webdriver.Chrome(
            executable_path=
            r'F:\Dokumentation\Programme\ChromeDriver\chromedriver.exe')
        driver.get(url)
        wait = ui.WebDriverWait(driver, 10)
        wait.until(page_loaded)

        #Signing in
        actions = ActionChains(driver)
        actions.send_keys(email)
        actions.send_keys(Keys.TAB)
        actions.send_keys(password)
        actions.send_keys(Keys.ENTER)
        actions.perform()
        time.sleep(5)
        return driver

    # Opening the web browser and the twitter page
    driver = log_in()

    # Get first node
    html_doc = driver.page_source
    soup = BeautifulSoup(html_doc, 'html.parser')

    # Create Anchor
    id = len(models.GephiNode.objects.all()) + 1
    print("id: ", id)
    label = soup.find(
        "a", {
            "class": "ProfileHeaderCard-nameLink u-textInheritColor js-nav"
        }).get_text()
    print("label: ", label)
    try:
        fan_count = soup.findAll("span",
                                 {"class": "ProfileNav-value"})[2].get_text()
    except:
        fan_count = 0

    print("fan_count: ", fan_count)
    handle = soup.find("b", {"class": "u-linkComplex-target"}).get_text()
    print("handle: ", handle)
    Anchor = models.GephiNode(id=id,
                              label=label,
                              fan_count=fan_count,
                              handle=handle)
    Anchor.create()
    print("Anchor: ", Anchor.id, Anchor.label)

    # Go to the accounts this profile is following
    driver.get(url + "/following")
    wait = ui.WebDriverWait(driver, 10)
    wait.until(page_loaded)

    try:
        lnks_cnt = int(
            soup.findAll("span",
                         {"class": "ProfileNav-value"})[1].get_text().replace(
                             ".", ""))
    except:
        lnks_cnt = 1500
    print(lnks_cnt)
    links = driver.find_elements_by_xpath(
        "//a[@class='ProfileCard-bg js-nav']")
    print(len(links))

    test = 1
    while len(links) < lnks_cnt - 2 and test < 1500:
        utils.scroll(driver)
        links = driver.find_elements_by_xpath(
            "//a[@class='ProfileCard-bg js-nav']")
        print("following: ", lnks_cnt)
        print("links: ", len(links))
        test += 1
    utils.scroll(driver)
    links = driver.find_elements_by_xpath(
        "//a[@class='ProfileCard-bg js-nav']")

    newUrlList = list()
    for lnk in links:
        newUrlList.append(lnk.get_attribute("href"))

    print("links unshuffeld: ", newUrlList)
    shuffle(newUrlList)
    print("links shuffeld: ", newUrlList)

    # Check everyone the Anchor follows
    print("link count: ", len(links))
    print("edge count: ",
          len(models.GephiEdge.objects.filter(source=Anchor.id)))
    if len(newUrlList) > len(
            models.GephiEdge.objects.filter(source=Anchor.id)):

        #First Followed
        for link in newUrlList:
            if link == "https://twitter.com/account/suspended":
                newUrlList = [
                    x for x in newUrlList
                    if x != "https://twitter.com/account/suspended"
                ]
            else:
                try:
                    driver.get(link)
                except:
                    driver = log_in()
                    driver.get(link)

                # Get HTML
                wait = ui.WebDriverWait(driver, 10)
                wait.until(page_loaded)
                html_doc = driver.page_source
                soup = BeautifulSoup(html_doc, 'html.parser')

                #Create Node
                id = len(models.GephiNode.objects.all()) + 1
                print("id: ", id)
                label = soup.find(
                    "a", {
                        "class":
                        "ProfileHeaderCard-nameLink u-textInheritColor js-nav"
                    }).get_text()
                print("label: ", label)
                try:
                    fan_count = soup.findAll(
                        "span", {"class": "ProfileNav-value"})[2].get_text()
                except:
                    fan_count = 0
                print("fan_count: ", fan_count)
                handle = soup.find("b", {
                    "class": "u-linkComplex-target"
                }).get_text()
                print("handle: ", handle)
                tempNode = models.GephiNode(id=id,
                                            label=label,
                                            fan_count=fan_count,
                                            handle=handle)
                tempNode.create()

                #Create Edge
                print("source: ", Anchor.id)
                print("target: ", tempNode.id)
                print("type: ", "Directed")
                edge_id = len(models.GephiEdge.objects.all())
                print("id: ", edge_id)

                firstEdge = models.GephiEdge(source=Anchor.id,
                                             target=tempNode.id,
                                             type="Directed",
                                             id=edge_id,
                                             weight=1)
                firstEdge.create()

                # Break if list is complete in DB
                print("link count: ", len(links))
                print("edge count: ",
                      len(models.GephiEdge.objects.filter(source=Anchor.id)))
                if len(newUrlList) == len(
                        models.GephiEdge.objects.filter(source=Anchor.id)):
                    break

            print("Nodes: ", len(models.GephiNode.objects.all()))
            print("Edges: ", len(models.GephiEdge.objects.all()))
            print(
                "Difference: ",
                len(models.GephiEdge.objects.all()) -
                len(models.GephiNode.objects.all()))

    driver.close()

    # Recursion

    print("depth: ", depth)
    if depth > 0:
        for uri in newUrlList:
            try:
                start_crawler(uri, email, password, depth - 1)
            except:
                try:
                    start_crawler(uri, email, password, depth - 1)
                except:
                    pass
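
# Hypothetical usage sketch for start_crawler: crawl one Twitter profile and
# everyone it follows, one level deep. The profile URL and the credentials
# below are placeholders.
start_crawler("https://twitter.com/example_user", "user@example.com",
              "password123", depth=1)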
Example #9
    def open_newsfeed(self):
        self.driver.get(self.url)
        scroll(self.driver, 5)
Example #10
def twi_hashtag_originator(hashtag, minYear=2006):
    # checking whether the argument is valid
    if hashtag is None or hashtag == "":
        print("Wrong argument: ", hashtag)
        return None

    # cache the hashtag for future use (screenshot)
    cache = "_".join(hashtag.split(" ")).replace("#", "")

    # separate the words and remove #
    h = [j for j in hashtag.split(" ") if j != ""]
    check = lambda x: x[1:] if x.startswith("#") else x
    h = [check(j).lower() for j in h]

    # forming the URL part
    hashtag = f"(%23{h[0]}"
    for rest in h[1:]:
        hashtag += f" AND %23{rest}"
    hashtag += ")"

    # preliminary code
    soup = BeautifulSoup("", "html.parser")
    createStorage("twitter")
    screen = False

    # close driver as soon as work done
    with setDriver(headless=False) as driver:
        wait = WebDriverWait(driver, 3)
        year = minYear

        # checking if the hashtag has ever actually been used
        url = f"https://twitter.com/search?q={hashtag}"
        driver.get(url)
        articles = returnAllTweets(wait)
        if len(articles) == 0:
            print("Hashtag has not been used in any tweets that are public")
            return None

        # check starting from minYear
        articles = []
        while (len(articles) == 0):
            url = f"https://twitter.com/search?q={hashtag}%20until%3A{str(year)}-01-01&src=typed_query"
            driver.get(url)
            articles = returnAllTweets(wait)
            year += 1
        scroll(driver, fastScroll=False)
        ti = returnAllTweets(wait)

        if len(ti) == 0:
            print("Something went wrong.")
            return None

        # remove tweets not containing the hashtag (this happens when the tweet is a parent of the hashtag-containing tweet)
        t = []
        for post in ti:
            inner = post.get_attribute("innerHTML").lower()
            good = True
            for each in h:
                if f">#{each}</a>" not in inner:
                    good = False
                    break
            if good:
                t.append(post)

        # sort all posts by retrieving the timestamp
        timed = []
        for post in t:
            try:
                s = post.find_element_by_xpath('.//time')
                s = s.get_attribute("datetime")
                if s:
                    try:
                        timed.append({
                            "timestamp":
                            datetime.datetime.strptime(
                                s, "%Y-%m-%dT%H:%M:%S.%fZ"),
                            "elem":
                            post
                        })
                    except:
                        pass
            except NoSuchElementException:
                pass
        t = sorted(timed, key=lambda x: x["timestamp"])

        # element of earliest post selected and screenshot taken
        if len(t) == 0:
            print("Something is seriously wrong")
            return None

        t = t[0]["elem"]
        try:
            with open(f"data/twitter/{cache}.png", "wb") as filex:
                filex.write(t.screenshot_as_png)
            screen = True
        except:
            print("Screenshot was unsuccessfull")
        t = t.get_attribute("innerHTML")
        soup = BeautifulSoup(t, "html.parser")

    result = {
        "poster": {
            "full_name": "",
            "username": "",
            "url": "",
            "id": "",
            "profile_image_url": ""
        },
        "timestamp": "",
        "post_link": "",
        "post_text": "",
        "embed": "",
        "screenshot": ""
    }
    a = soup.find("a")
    if a:
        result["poster"]["url"] = completeTwitterLink(a.get('href', ""))
        result["poster"]["username"] = result["poster"]["url"].replace(
            "https://www.twitter.com/", "")
        k = a.findNext("a", {"href": a.get("href", "")})
        if k:
            result["poster"]["full_name"] = k.text.partition("@")[0]
        k = a.find("img")
        if k:
            result["poster"]["profile_image_url"] = k.get("src", "")
    a = soup.find(lambda tag: tag.name == 'a' and tag.find("time"))
    if a:
        link = completeTwitterLink(a.get("href", ""))
        result["post_link"] = link
        result[
            "embed"] = f'<blockquote class="twitter-tweet"><a href="{link}"></a></blockquote> <script async src="https://platform.twitter.com/widgets.js" charset="utf-8"></script>'
        k = a.find("time")
        if k:
            result["timestamp"] = k.get("datetime", "")
        soup = BeautifulSoup(str(soup).partition(str(a))[2], "html.parser")
        result["post_text"] = " ".join([s.text for s in soup.find_all("span")])
    if screen:
        result["screenshot"] = os.getcwd() + f"/data/twitter/{cache}"
    return result
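
# Hypothetical usage sketch: look up the earliest public tweet that used a
# hashtag, starting the year-by-year search from 2010. The hashtag is a
# placeholder; the returned dict follows the structure built above.
origin = twi_hashtag_originator("#opensource", minYear=2010)
if origin:
    print(origin["poster"]["username"], origin["timestamp"],
          origin["post_link"])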
Example #11
    def getPosts(self):
        # navigating to page
        newUrl = self.url + "/community"
        if self.driver.current_url != newUrl:
            self.driver.get(newUrl)
            utils.scroll(self.driver)
            sleep(1)

        content = str(self.driver.page_source)

        # open comments
        comm = re.findall(r'View all [0-9]+ comments', content)
        comm += re.findall(r'View comment', content)
        for co in comm:
            elements = self.driver.find_elements_by_xpath(
                "//*[text()[contains(.,'" + co + "')]]")
            for element in elements:
                self.driver.execute_script("arguments[0].click();", element)
        try:
            self.wait.until(
                EC.visibility_of_all_elements_located(
                    (By.TAG_NAME, 'ytd-comment-thread-renderer')))
        except TimeoutException:
            pass

        sleep(1)
        soup = BeautifulSoup(self.driver.page_source, "html.parser")
        blocks = soup.find_all('ytd-backstage-post-thread-renderer')
        posts = []

        # extracting posts one by one
        for block in blocks:
            post = self.initiatePostSkeleton()

            # extracting author link and picture
            a = block.find('div', id='author-thumbnail')
            if a:
                a = a.find('a')
                if a:
                    post["post_author"]["link"] = utils.completeYoutubeLink(
                        a.get('href', ''))
                    img = a.find('img')
                    if img:
                        post["post_author"]["media_directory"] = img.get(
                            'src', "")

            # extracting author name
            a = block.find('a', id='author-text')
            if a:
                post["post_author"]["name"] = utils.beautifyText(a.text)

            # extracting post link and publish time
            a = block.find('yt-formatted-string', id='published-time-text')
            if a:
                a = a.find('a')
                if a:
                    post["timestamp"] = a.text
                    post["post_link"] = utils.completeYoutubeLink(
                        a.get('href', ''))

            # extracting text
            a = block.find('yt-formatted-string', id='content-text')
            if a:
                post["post_type"]["is_text"] = True
                post["post_text"] = a.text.replace('\ufeff', '')

            # extracting vote data
            v = block.find('ytd-backstage-poll-renderer', id='poll-attachment')
            if v:
                a = v.find('yt-formatted-string', id='vote-info')
                if a:
                    count = a.text.split()[0]
                    if count.isnumeric():
                        post["votes"]["total_votes"] = int(count)
                        post["post_type"]["is_vote"] = True
                        post["votes"]["vote_text"] = post["post_text"]
                        a = v.find_all('yt-formatted-string',
                                       {'class': 'choice-text'})
                        for choice in a:
                            post["votes"]["vote_options"].append(choice.text)

            # getting likes
            a = block.find('span', id='vote-count-middle')
            if a:
                count = utils.beautifyText(a.text)
                if count.isnumeric():
                    post["post_likes"] = int(count)

            # getting links
            a = []
            srcs = []
            b = block.find('div', id='content')
            c = block.find('div', id='content-attachment')
            for y in [b, c]:
                if y:
                    a += y.find_all('a')
                    srcs += y.find_all('img')
            for link in a:
                link = utils.completeYoutubeLink(link.get('href', ""))
                if "/watch?" in link:
                    post["post_type"]["is_video"] = True
                    post["video_links"].append(link)
                elif link not in [
                        post["post_link"], self.url, '', self.origin
                ]:
                    post["other_links"].append(link)
            for img in srcs:
                post["post_type"]["is_picture"] = True
                src = img.get('src', '')
                if src != '':
                    post["picture_links"].append(src)

            # removing duplicate links
            for option in ["picture_links", "video_links", "other_links"]:
                post[option] = list(set(post[option]))

            # getting comments
            for com in block.find_all('ytd-comment-thread-renderer'):
                comment = self.initiateCommentSkeleton()
                a = com.find('div', id='author-thumbnail')
                if a:
                    a = a.find('a')
                    if a:
                        comment["comment_author"][
                            "link"] = utils.completeYoutubeLink(
                                a.get('href', ''))
                        img = a.find('img')
                        if img:
                            comment["comment_author"][
                                "media_directory"] = img.get('src', "")
                a = com.find('a', id='author-text')
                if a:
                    comment["comment_author"]["name"] = utils.beautifyText(
                        a.text)
                a = com.find('yt-formatted-string',
                             {'class': 'published-time-text'})
                if a:
                    a = a.find('a')
                    if a:
                        comment["timestamp"] = a.text
                        comment["comment_url"] = utils.completeYoutubeLink(
                            a.get('href', ''))
                a = com.find('yt-formatted-string', id='content-text')
                if a:
                    comment["comment_text"] = a.text.replace('\ufeff', '')
                    for link in a.find_all('a'):
                        comment["comment_links"].append(
                            utils.completeYoutubeLink(link.get('href', '')))
                a = com.find('span', id='vote-count-middle')
                if a:
                    count = utils.beautifyText(a.text)
                    if count.isnumeric():
                        comment["comment_likes"] = int(count)
                post["post_comments"].append(comment)
            posts.append(post)
        self.profile["posts"] = posts
        return posts
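
# Hypothetical usage sketch for getPosts, assuming it is a method of a YouTube
# channel scraper class whose constructor sets up self.driver, self.wait,
# self.url, self.origin and self.profile; the class name and channel URL below
# are made up for illustration.
scraper = YoutubeChannelScraper("https://www.youtube.com/c/examplechannel")
posts = scraper.getPosts()
print(f"Scraped {len(posts)} community posts")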