Ejemplos de webcrawl en Python, ejemplos de bs4.webcrawl en Python

Ejemplo n.º 1

0

Mostrar archivo

def interactWithPhotos(driver, interactionsTillNow, blackListedUsers,
                       archiveUsers):
    """Like and/or archive the photos currently rendered on the page.

    Every ``<article>`` element found through ``driver`` is treated as one
    photo. The CSS classes of the like and archive buttons are read from the
    first photo's markup and used to locate the buttons for all photos.

    Args:
        driver: Browser driver exposing ``find_elements_by_tag_name`` and
            ``find_elements_by_css_selector``.
        interactionsTillNow: Number of interactions already performed before
            this call; added to the local count when checking the limit.
        blackListedUsers: Container of usernames whose photos are skipped.
        archiveUsers: Container of usernames whose photos may be archived
            (forwarded to ``shouldArchiveThePhoto``).

    Returns:
        The number of interactions performed by this call.
    """
    interactionCount = 0

    # Find all photos in the current page
    photolist = driver.find_elements_by_tag_name("article")
    if not photolist:
        # Nothing rendered: indexing photolist[0] below would raise.
        return interactionCount

    # Find like and archive button classes from the first photo's markup
    optionList = webcrawl(photolist[0].get_attribute('innerHTML'),
                          "html.parser").contents[2].contents[0].findAll("a")

    likeButtonClass = str(optionList[0].attrs["class"])
    archiveButtonClass = str(optionList[2].attrs["class"])

    likeButtonClassSelector = parser.concatenate2(likeButtonClass)
    archiveButtonClassSelector = parser.concatenate2(archiveButtonClass)

    likeButtons = driver.find_elements_by_css_selector(likeButtonClassSelector)
    archiveButtons = driver.find_elements_by_css_selector(
        archiveButtonClassSelector)

    for i, photo in enumerate(photolist):

        # Stop once the allowed number of interactions is reached. ">=" is
        # used so the limit still holds if the count steps past the maximum.
        if (interactionsTillNow +
                interactionCount) >= CONSTANTS.MAX_INTERACTIONS:
            break

        # Find the username to which the photo belongs
        photoContainer = photo.get_attribute('innerHTML')
        usernameOfPhoto = str(
            webcrawl(photoContainer, "html.parser").header.contents[1].
            contents[0].contents[0].contents[0].contents[0])

        # Don't interact with black listed users
        if usernameOfPhoto in blackListedUsers:
            utils.logMessage("User: " + usernameOfPhoto + " is blacklisted")
            continue

        shouldLike = shouldlikePhoto(photoContainer)
        shouldArchive = shouldArchiveThePhoto(photoContainer, usernameOfPhoto,
                                              archiveUsers)

        # NOTE(review): the original text of the two branches below was
        # partially lost (string concatenations were masked and lines were
        # merged). The archive branch and the counter increments are a
        # reconstruction -- confirm against the upstream source, including
        # whether a wait between clicks is expected.
        if shouldLike:
            likeButtons[i].click()
            utils.logMessage("Liked photo of user: " + usernameOfPhoto)
            interactionCount += 1

        if shouldArchive:
            archiveButtons[i].click()
            utils.logMessage("Saved photo of user: " + usernameOfPhoto)
            interactionCount += 1

        if shouldLike or shouldArchive:
            utils.logMessage("Interaction Count: " +
                             str(interactionCount + interactionsTillNow))

    return interactionCount

Ejemplo n.º 2

0

Mostrar archivo

def getData(meaning):
    """Download the text paragraphs of a wordsinasentence.com entry.

    The page ``https://wordsinasentence.com/<meaning>-in-a-sentence/`` is
    fetched and the direct text of every ``<p>`` inside the
    ``thecontent clearfix`` div is collected, one paragraph per line.
    Paragraphs mentioning "vocabulary videos" or "YouTube channel" are
    dropped.

    Args:
        meaning: Word used to build the page URL.

    Returns:
        The collected paragraphs, each terminated by a newline. On any
        failure the exception is printed and whatever was collected so far
        (possibly an empty string) is returned.
    """
    url = "https://wordsinasentence.com/" + meaning + "-in-a-sentence/"
    print("Making a request to the URL: " + url)

    collected = []

    try:
        # Put some headers
        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
        # Without a timeout the request can block forever on a stalled
        # server; a timeout error is reported by the handler below.
        data = requests.get(url, headers=headers, timeout=10)
        soup = webcrawl(data.text, "html.parser")
        content = soup.find("div", {"class": "thecontent clearfix"})

        for p_tag in content.find_all("p"):
            # Only the paragraph's own text node is wanted, not text nested
            # in child tags; find() yields None when there is none.
            unicode_string = p_tag.find(text=True, recursive=False)
            if unicode_string is None:
                continue
            if "vocabulary videos" in unicode_string or "YouTube channel" in unicode_string:
                continue
            collected.append(unicode_string + u"\n")

    except Exception as e:
        print("Exception raised in getData(" + meaning + ") method: " + str(e))

    return u''.join(collected)

Ejemplo n.º 3

0

Mostrar archivo

def getStaus(name):
    """Return the credential shown on a Quora profile page.

    Args:
        name: Profile identifier appended to
            ``https://www.quora.com/profile/``.

    Returns:
        The first child of the ``IdentityCredential UserCredential`` span,
        or the string "No status found" when no such span exists. If
        anything raises, the error and URL are printed and ``None`` is
        returned implicitly.
    """
    url = "https://www.quora.com/profile/" + name
    print(url)

    # Put appropriate headers here later
    user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'

    try:
        response = requests.get(url, headers={'User-Agent': user_agent})
        page = webcrawl(response.text, "html.parser")
        credential = page.find("span",
                               class_="IdentityCredential UserCredential")

        # find() gives None when the span is absent; None has no "contents".
        if not hasattr(credential, "contents"):
            return "No status found"
        return credential.contents[0]

    except Exception as e:
        print(e)
        print('Extra info:' + '\nURL: ' + url)

Ejemplo n.º 4

0

Mostrar archivo

Archivo: login.py Proyecto: petrenkosv/python_programs

def login(url, driver):
    """Log in through the browser and return the resulting page markup.

    Credentials are obtained from ``getCredentials(sys.argv[1])`` and must
    provide the keys "username" and "password".

    Args:
        url: Address handed to ``openLoginPage``.
        driver: Browser driver used to fill and submit the login form.

    Returns:
        The page source after submitting, re-serialised via ``webcrawl`` as
        a string.
    """
    creds = getCredentials(sys.argv[1])
    login_html = openLoginPage(driver, url)

    # Turn the submit button's class into a CSS selector
    submit_selector = parser.concatenate1(parser.findSubmitButton(login_html))

    # Locate both form fields first, then type into them
    user_field = driver.find_element_by_name("username")
    pass_field = driver.find_element_by_name("password")
    user_field.send_keys(creds["username"])
    pass_field.send_keys(creds["password"])

    # Submit the form and give the next page time to load
    driver.find_element_by_css_selector(submit_selector).submit()
    time.sleep(3 * CONSTANTS.WAIT_TIME)

    return str(webcrawl(driver.page_source, "html.parser"))

Ejemplo n.º 5

0

Mostrar archivo

def shouldlikePhoto(photoContainer):
    """Tell whether the photo's like button matches the "white" pattern.

    Args:
        photoContainer: Inner HTML of one photo element.

    Returns:
        True when ``CONSTANTS.WHITE_LIKE_BUTTON_CLASS`` is found in the
        markup of the like-button node, False otherwise. Presumably a white
        button means the photo has not been liked yet -- confirm.
    """
    soup = webcrawl(photoContainer, "html.parser")
    # The like button is the first entry of the first child of the third
    # top-level node.
    like_markup = str(soup.contents[2].contents[0].contents[0])
    found = re.search(CONSTANTS.WHITE_LIKE_BUTTON_CLASS, like_markup)
    return found is not None

Ejemplo n.º 6

0

Mostrar archivo

def shouldArchiveThePhoto(photoContainer, usernameOfPhoto, archiveUsers):
    """Tell whether a photo qualifies for archiving.

    Args:
        photoContainer: Inner HTML of one photo element.
        usernameOfPhoto: Owner of the photo.
        archiveUsers: Container of usernames whose photos may be archived.

    Returns:
        False when the owner is not in ``archiveUsers``. Otherwise True only
        if ``CONSTANTS.WHITE_ARCHIVE_BUTTON_CLASS`` is found in the markup of
        the archive-button node.
    """
    # Photos of users outside the archive list are never archived; the
    # markup is not even parsed for them.
    if usernameOfPhoto not in archiveUsers:
        return False

    soup = webcrawl(photoContainer, "html.parser")
    # The archive button is the third entry of the first child of the third
    # top-level node.
    archive_markup = str(soup.contents[2].contents[0].contents[2])
    found = re.search(CONSTANTS.WHITE_ARCHIVE_BUTTON_CLASS, archive_markup)
    return found is not None

Ejemplo n.º 7

0

Mostrar archivo

def watchStories(driver):
    """Click the stories button on the current page.

    The button's class is obtained from ``parser.findStoriesButton`` applied
    to the body markup. When that class value contains spaces it is turned
    into a CSS selector with ``parser.concatenate2``; otherwise it is used
    directly as a class name.

    Args:
        driver: Browser driver for the page holding the stories button.
    """
    pageData = driver.find_element_by_tag_name("body").get_attribute(
        'innerHTML')
    storiesButtonClass = parser.findStoriesButton(
        str(webcrawl(pageData, "html.parser")))

    if parser.containsSpaces(storiesButtonClass):
        storiesButtonClassSelector = parser.concatenate2(storiesButtonClass)
        # The singular find_element_* is required here: the plural form
        # returns a list, which has no click() method.
        driver.find_element_by_css_selector(
            storiesButtonClassSelector).click()
    else:
        driver.find_element_by_class_name(storiesButtonClass).click()

Ejemplo n.º 8

0

Mostrar archivo

Archivo: login.py Proyecto: petrenkosv/python_programs

def openLoginPage(driver, url):
    """Navigate from the landing page to the login page.

    Args:
        driver: Browser driver used for navigation.
        url: Address to open first.

    Returns:
        The login page source, re-serialised via ``webcrawl`` as a string.
    """
    # By default we go to the sign-up page
    driver.get(url)
    time.sleep(CONSTANTS.WAIT_TIME)  # wait for the page to load

    # On the sign-up page we need to find the login button and click it,
    # so that we go to the login page.
    signup_html = str(webcrawl(driver.page_source, "html.parser"))
    login_button_class = parser.findLoginButton(signup_html)
    driver.find_element_by_class_name(login_button_class).click()
    time.sleep(CONSTANTS.WAIT_TIME)  # wait for the page to load

    return str(webcrawl(driver.page_source, "html.parser"))

Ejemplo n.º 9

0

Mostrar archivo

def parseQuoraQuestionFeed(filename):
    """Extract question summaries from a saved Quora question-feed page.

    For every feed item the following is collected:

    (i) Question name
    (ii) Number of existing answers
    (iii) Number of followers

    Note: the HTML class names are hardcoded, so this parser depends on
    Quora's current markup. It should be replaced by something more robust
    (for instance an official topic-question-feed API, if one exists) so
    that it does not break when Quora changes its UI or HTML skeleton.

    Feed items that lack any of the expected elements are skipped instead
    of aborting the whole parse.

    Args:
        filename: Path of the HTML file to read.

    Returns:
        A list of dicts with the keys "questionName" (str),
        "existingAnswers" (int) and "followers" (str, as shown on the page).
    """
    # Read the HTML file
    with open(filename, "r") as f:
        content = f.read()

    soup = webcrawl(content, "html.parser")

    questionList = soup.findAll(
        "div", {"class": "QuestionFeedStory FeedStory feed_item"})

    results = []
    for question in questionList:

        questionName = question.find('span', class_="ui_qtext_rendered_qtext")
        answerLink = question.find('a', class_="answer_count_prominent")

        # The follow button's classes appear in either of two orders.
        followButton = question.find(
            'div', class_="ItemComponent FollowActionItem primary_item")
        if followButton is None:
            followButton = question.find(
                'div', class_="FollowActionItem ItemComponent primary_item")

        if questionName is None or answerLink is None or followButton is None:
            continue

        followCountWrapper = followButton.find('span',
                                               class_="icon_action_bar-count")
        if followCountWrapper is None:
            continue

        # The follower count is the text of the second nested span.
        countSpans = followCountWrapper.find_all('span')
        if len(countSpans) < 2:
            continue
        followerCount = countSpans[1].text

        if answerLink.text == "No answer yet":
            existingNumAnswers = 0
        else:
            # The link text starts with the number of answers.
            existingNumAnswers = int(answerLink.text.split()[0])

        results.append({
            "questionName": questionName.text,
            "existingAnswers": existingNumAnswers,
            "followers": followerCount,
        })

    return results