def interactWithPhotos(driver, interactionsTillNow, blackListedUsers, archiveUsers):
    interactionCount = 0

    # Find all photos in the current page
    # photolist = driver.find_elements_by_css_selector(articleClassSelector)
    photolist = driver.find_elements_by_tag_name("article")

    # Find like and archive button classes
    optionList = webcrawl(photolist[0].get_attribute('innerHTML'),
                          "html.parser").contents[2].contents[0].findAll("a")
    likeButtonClass = str(optionList[0].attrs["class"])
    archiveButtonClass = str(optionList[2].attrs["class"])
    likeButtonClassSelector = parser.concatenate2(likeButtonClass)
    archiveButtonClassSelector = parser.concatenate2(archiveButtonClass)
    likeButtons = driver.find_elements_by_css_selector(likeButtonClassSelector)
    archiveButtons = driver.find_elements_by_css_selector(
        archiveButtonClassSelector)

    for i in range(0, len(photolist)):
        # Return, if we reached max interactions allowed
        if (interactionsTillNow + interactionCount) == CONSTANTS.MAX_INTERACTIONS:
            break

        # Find the username to which the photo belongs
        photoContainer = photolist[i].get_attribute('innerHTML')
        usernameOfPhoto = str(
            webcrawl(photoContainer, "html.parser").header.contents[1].
            contents[0].contents[0].contents[0].contents[0])

        # Don't interact with blacklisted users
        if usernameOfPhoto in blackListedUsers:
            utils.logMessage("User: " + usernameOfPhoto + " is blacklisted")
            continue

        shouldLike = shouldlikePhoto(photoContainer)
        shouldArchive = shouldArchiveThePhoto(photoContainer, usernameOfPhoto,
                                              archiveUsers)
        if shouldLike:
            likeButtons[i].click()
            utils.logMessage("Liked photo of user: " + usernameOfPhoto)
            interactionCount += 1
        if shouldArchive:
            archiveButtons[i].click()
            utils.logMessage("Saved photo of user: " + usernameOfPhoto)
            interactionCount += 1

    utils.logMessage("Interaction Count: " +
                     str(interactionCount + interactionsTillNow))
    return interactionCount
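# A minimal driver loop for interactWithPhotos: a sketch under the assumption
# that the feed loads more <article> elements as the page is scrolled, and
# that the old selenium API (find_element_by_* / execute_script, selenium < 4)
# used above is available. The scroll-step logic is illustrative, not part of
# the original module.
def interactWithFeed(driver, blackListedUsers, archiveUsers, scrollSteps=5):
    totalInteractions = 0
    for _ in range(scrollSteps):
        totalInteractions += interactWithPhotos(
            driver, totalInteractions, blackListedUsers, archiveUsers)
        if totalInteractions >= CONSTANTS.MAX_INTERACTIONS:
            break
        # Scroll to the bottom so the feed loads the next batch of photos
        driver.execute_script(
            "window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(CONSTANTS.WAIT_TIME)
    return totalInteractions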
def getData(meaning):
    url = "https://wordsinasentence.com/" + meaning + "-in-a-sentence/"
    print("Making a request to the URL: " + url)
    return_val = u''
    try:
        # Put some headers
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/39.0.2171.95 Safari/537.36'
        }
        data = requests.get(url, headers=headers)
        soup = webcrawl(data.text, "html.parser")
        content = soup.find("div", {"class": "thecontent clearfix"})
        list_of_p = content.find_all("p")
        for p_tag in list_of_p:
            if len(p_tag.contents) > 0:
                unicode_string = p_tag.find(text=True, recursive=False)
                if unicode_string is None:
                    continue
                # Skip promotional lines that are not example sentences
                if ("vocabulary videos" not in unicode_string and
                        "YouTube channel" not in unicode_string):
                    return_val += unicode_string + u"\n"
        return return_val
    except Exception as e:
        print("Exception raised in getData(" + meaning + ") method: " + str(e))
        return return_val
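# A hedged usage sketch for getData: "ebullient" is just an illustrative
# word, and the sketch assumes `requests` is installed and that `webcrawl`
# is this project's alias for the BeautifulSoup constructor.
def printExampleSentences(word):
    sentences = getData(word)
    if sentences:
        print("Example sentences for '" + word + "':\n" + sentences)
    else:
        print("No sentences scraped; the page layout may have changed.")

# printExampleSentences("ebullient")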
def getStaus(name):
    url = "https://www.quora.com/profile/" + name
    print(url)
    try:
        # Put appropriate headers here later
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/39.0.2171.95 Safari/537.36'
        }
        data = requests.get(url, headers=headers)
        soup = webcrawl(data.text, "html.parser")
        statusTag = soup.find("span",
                              class_="IdentityCredential UserCredential")
        if hasattr(statusTag, "contents"):
            return statusTag.contents[0]
        else:
            return "No status found"
    except Exception as e:
        print(e)
        print('Extra info:' + '\nURL: ' + url)
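# A short usage sketch for getStaus; the profile name is a hypothetical
# placeholder, and the lookup only works while Quora keeps the
# "IdentityCredential UserCredential" class in its profile markup.
def printProfileStatus(name):
    status = getStaus(name)
    print(name + ": " + str(status))

# printProfileStatus("Some-Profile-Name")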
def login(url, driver):
    credentials = getCredentials(sys.argv[1])
    loginPageData = openLoginPage(driver, url)

    # Find the submit button on login page
    submitClassName = parser.findSubmitButton(loginPageData)
    submitClassName = parser.concatenate1(submitClassName)

    # Find username and password fields and fill them in
    username = driver.find_element_by_name("username")
    password = driver.find_element_by_name("password")
    username.send_keys(credentials["username"])
    password.send_keys(credentials["password"])

    # Do the login
    driver.find_element_by_css_selector(submitClassName).submit()

    # Wait for the page to load
    time.sleep(3 * CONSTANTS.WAIT_TIME)
    homePageSrc = driver.page_source
    homePageData = str(webcrawl(homePageSrc, "html.parser"))
    return homePageData
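# A sketch of wiring the login flow together, assuming the pre-selenium-4
# webdriver API that login()/openLoginPage() rely on; signUpUrl is a
# hypothetical placeholder for the site's sign-up page.
from selenium import webdriver

def runLoginFlow(signUpUrl):
    driver = webdriver.Chrome()  # any locally available webdriver works
    try:
        homePageData = login(signUpUrl, driver)
        print("Fetched home page HTML (" + str(len(homePageData)) + " chars)")
        return driver
    except Exception:
        driver.quit()
        raise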
def shouldlikePhoto(photoContainer):
    # Find whether we have already liked the photo or not
    likeStatusContent = str(
        webcrawl(photoContainer,
                 "html.parser").contents[2].contents[0].contents[0])
    return bool(re.search(CONSTANTS.WHITE_LIKE_BUTTON_CLASS,
                          likeStatusContent))
def shouldArchiveThePhoto(photoContainer, usernameOfPhoto, archiveUsers):
    if usernameOfPhoto in archiveUsers:
        # Find whether we have already saved the photo or not
        archiveStatusContent = str(
            webcrawl(photoContainer,
                     "html.parser").contents[2].contents[0].contents[2])
        return bool(
            re.search(CONSTANTS.WHITE_ARCHIVE_BUTTON_CLASS,
                      archiveStatusContent))
    else:
        return False
def watchStories(driver):
    pageData = driver.find_element_by_tag_name("body").get_attribute(
        'innerHTML')
    storiesButtonClass = parser.findStoriesButton(
        str(webcrawl(pageData, "html.parser")))
    if parser.containsSpaces(storiesButtonClass):
        storiesButtonClassSelector = parser.concatenate2(storiesButtonClass)
        # find_element (singular): find_elements returns a list, which has
        # no click() method
        driver.find_element_by_css_selector(
            storiesButtonClassSelector).click()
    else:
        driver.find_element_by_class_name(storiesButtonClass).click()
def openLoginPage(driver, url):
    # By default we go to the sign-up page
    driver.get(url)

    # Wait for the page to load
    time.sleep(CONSTANTS.WAIT_TIME)
    '''
    On the sign-up page we need to find the login button and click it,
    so that we go to the login page
    '''
    signUpPageSrc = driver.page_source
    signUpPageData = str(webcrawl(signUpPageSrc, "html.parser"))
    loginButton = parser.findLoginButton(signUpPageData)
    driver.find_element_by_class_name(loginButton).click()

    # Wait for the page to load
    time.sleep(CONSTANTS.WAIT_TIME)
    loginPageSrc = driver.page_source
    loginPageData = str(webcrawl(loginPageSrc, "html.parser"))
    return loginPageData
def parseQuoraQuestionFeed(filename):
    # Read the HTML file
    content = ""
    with open(filename, "r") as f:
        content = f.read()
    '''
    Start parsing the document for the required entities:
        (i)   Question name
        (ii)  Existing answers
        (iii) Number of followers

    Note: This version hardcodes HTML class attributes, and we need to
    change this to something more robust so that the code doesn't break
    if Quora changes its UI or HTML skeleton. For instance, if Quora
    provided an API to access the topic question feed, we would not
    need to follow this approach.
    '''
    soup = webcrawl(content, "html.parser")
    questionList = soup.findAll(
        "div", {"class": "QuestionFeedStory FeedStory feed_item"})
    results = []
    for question in questionList:
        questionName = question.find('span', class_="ui_qtext_rendered_qtext")
        existingNumAnswers = question.find('a', class_="answer_count_prominent")
        if existingNumAnswers.text == "No answer yet":
            existingNumAnswers = 0
        else:
            existingNumAnswers = int(existingNumAnswers.text.split()[0])
        followButton = question.find(
            'div', class_="ItemComponent FollowActionItem primary_item")
        if followButton is None:
            followButton = question.find(
                'div', class_="FollowActionItem ItemComponent primary_item")
        followCountWrapper = followButton.find('span',
                                               class_="icon_action_bar-count")
        followerCount = followCountWrapper.find_all('span')[1].text

        # Prepare a dictionary and keep appending to the list
        result = {
            "questionName": questionName.text,
            "existingAnswers": existingNumAnswers,
            "followers": followerCount
        }
        results.append(result)
    return results
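# A usage sketch for parseQuoraQuestionFeed, assuming a saved copy of a
# Quora topic feed page; the filename is a hypothetical placeholder.
# Sorting by existing answers surfaces the least-answered questions first.
def printFeedSummary(filename):
    results = parseQuoraQuestionFeed(filename)
    results.sort(key=lambda r: r["existingAnswers"])
    for r in results:
        print(r["questionName"] + " | answers: " +
              str(r["existingAnswers"]) + " | followers: " + r["followers"])

# printFeedSummary("quora_feed.html")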