def getResultPageInfo(self):
    """Scrape the labeled sections of the current result page.

    Returns [sponsor, awardAmount, recipients, requirements, additionalInfo,
    contact, address, sourceWebsite, sourceText]; the last two are '' when the
    page has no external-source button.
    """

    def cleanedTextAt(xpath):
        # Cleaned full text content of the node at `xpath`.
        return CleanText.cleanALLtheText(
            self.driver.find_element_by_xpath(xpath).get_attribute('textContent'))

    sponsor = cleanedTextAt("//div/p/strong[text() = 'Awarded By']/../../following-sibling::div/p")
    awardAmount = cleanedTextAt("//div/p/strong[text() = 'Award Amount']/../../following-sibling::div/p")
    recipients = cleanedTextAt("//div/p/strong[text() = 'Recipients']/../../following-sibling::div/p")
    requirements = cleanedTextAt("//div/p/strong[text() = 'Requirements']/../../following-sibling::div")
    additionalInfo = cleanedTextAt("//div/p/strong[text() = 'Additional Information']/../../following-sibling::div/p")
    contact = cleanedTextAt("//div/p/strong[text() = 'Contact']/../../following-sibling::div/p")
    address = cleanedTextAt("//div/p/strong[text() = 'Address']/../../following-sibling::div")

    sourceWebsite = ''
    sourceText = ''
    if self.checkIfElementExists("//a[@class='button secondary']"):
        sourceWebsite = self.driver.find_element_by_xpath(
            "//a[@class='button secondary']").get_attribute('href')
        sourceText = CleanText.cleanALLtheText(RipPage.getPageSource(sourceWebsite))

    return [sponsor, awardAmount, recipients, requirements, additionalInfo,
            contact, address, sourceWebsite, sourceText]
def __init__(self, url):
    """Store the target URL, rip its HTML source, and reset page metadata."""
    self.url = url
    # Fetch the raw page source up front so later parsing needs no extra request.
    self.htmlSource = RipPage.getPageSource(url)
    self.title = ''
    self.pageurl = ''
    self.allurlsonpage = []
    self.description = ''
def getLeads(self):
    """Walk the scholarship results table, visit each result page, and return the lead arrays."""
    titleAnchors = self.driver.find_elements_by_xpath("//td[@class='scholarshipNameColumn']/div/a")
    amountCells = self.driver.find_elements_by_xpath("//td[@class='amount']")
    deadlineCells = self.driver.find_elements_by_xpath("//td[@class='deadline']")

    titles = self.getTitlesList(titleAnchors)
    links = self.getLinksList(titleAnchors)
    amounts = self.getAmountsList(amountCells)
    deadlines = self.getDeadlinesList(deadlineCells)

    for i, rawTitle in enumerate(titles):
        title = CleanText.cleanALLtheText(rawTitle)
        resultPageLink = links[i]
        amount = CleanText.cleanALLtheText(amounts[i])
        deadline = deadlines[i]
        pageInfo = self.goToResultPageAndPullInformation(resultPageLink)
        sponsor = CleanText.cleanALLtheText(pageInfo[0])
        sourceWebsite = pageInfo[1]
        description = CleanText.cleanALLtheText(pageInfo[2])
        requirements = CleanText.cleanALLtheText(pageInfo[3])
        # Only rip the external source page when the href is a real http(s) URL.
        if re.search('^https?://', sourceWebsite):
            sourceText = CleanText.cleanALLtheText(RipPage.getPageSource(sourceWebsite))
        else:
            sourceText = ''
        self.collegeGreenLightLeadsArrays.append(
            [title, amount, deadline, sponsor, description, requirements,
             resultPageLink, sourceWebsite, sourceText])

    self.driver.quit()
    return self.collegeGreenLightLeadsArrays
def makeLeadArrayAndAddToGrantForwardLeads(self, singleResultArray):
    """Build one lead row from a (name, url) search hit and append it to the collection."""
    name = CleanText.cleanALLtheText(singleResultArray[0])
    url = singleResultArray[1]
    pageInfo = self.goToResultPageAndPullInformation(url)
    sourceWebsite = pageInfo[0]
    # Clean every text field produced by the result page.
    keyword = CleanText.cleanALLtheText(self.searchTerm)
    sponsor = CleanText.cleanALLtheText(pageInfo[1])
    amount = CleanText.cleanALLtheText(pageInfo[2])
    applicantType = CleanText.cleanALLtheText(pageInfo[3])
    citizenshipResidency = CleanText.cleanALLtheText(pageInfo[4])
    activityLocation = CleanText.cleanALLtheText(pageInfo[5])
    abstract = CleanText.cleanALLtheText(pageInfo[6])
    eligibility = CleanText.cleanALLtheText(pageInfo[7])
    categories = CleanText.cleanALLtheText(pageInfo[8])
    sourceText = CleanText.cleanALLtheText(RipPage.getPageSource(sourceWebsite))
    self.arrayOfPivotLeads.append(
        [keyword, url, name, abstract, sponsor, amount, applicantType,
         citizenshipResidency, activityLocation, eligibility, categories,
         sourceWebsite, sourceText])
def getSourceWebsitesAndSourceTexts(self):
    """Return parallel lists: external source URLs and the cleaned text of each target page."""
    anchors = self.driver.find_elements_by_xpath("//div[@class='col-xs-8 col-xs-offset-2']/a")
    websites = [anchor.get_attribute('href') for anchor in anchors]
    rippedPages = [RipPage.getPageSource(site) for site in websites]
    cleanedPages = [CleanText.cleanALLtheText(page) for page in rippedPages]
    return websites, cleanedPages
def goToResultPageAndPullInformation(self, resultPageLink):
    """Open a GrantForward result page and pull every field it exposes.

    Returns [description, sponsor, amount, eligibility, submissionInfo,
    categories, sourceWebsite, sourceText, deadline]; fields that are not
    present on the page come back as ''.
    """
    self.driver.get(resultPageLink)
    self.driver.implicitly_wait(2)

    def grab(xpath):
        # Cleaned text of the node at `xpath`, or '' when it is absent.
        if self.checkIfElementExists(xpath):
            return CleanText.cleanALLtheText(
                self.driver.find_element_by_xpath(xpath).get_attribute('textContent'))
        return ''

    description = grab("//div[@id = 'field-description']/div[@class = 'content-collapsed']")
    sponsor = grab("//div[@class = 'sponsor-content']/div/a")
    amount = grab("//div[@id = 'field-amount_info']/div[@class = 'content-collapsed']")
    eligibility = grab("//div[@id = 'field-eligibility']/div[@class = 'content-collapsed']")
    submissionInfo = grab("//div[@id = 'field-submission_info']/div[@class = 'content-collapsed']")
    categories = grab("//div[@id = 'field-subjects']/ul")

    sourceWebsite = ''
    sourceText = ''
    if self.checkIfElementExists("//a[@class = 'source-link btn btn-warning']"):
        sourceWebsite = self.driver.find_element_by_xpath(
            "//a[@class = 'source-link btn btn-warning']").get_attribute('href')
        sourceText = CleanText.cleanALLtheText(RipPage.getPageSource(sourceWebsite))

    deadline = grab("//div[@class='table-responsive deadline-tables']/table/tbody")

    return [description, sponsor, amount, eligibility, submissionInfo,
            categories, sourceWebsite, sourceText, deadline]
def getLeads(self):
    """Scrape teacher.org scholarship sections into lead arrays.

    Walks every <h3> heading outside the intro block; the first section has a
    different layout (the intro div) and is scraped with its own xpaths.
    Returns the accumulated list of
    [title, description, requirements, sourceWebsite, sourceText] rows.
    """
    titleDivs = self.driver.find_elements_by_xpath("//h3[not(ancestor::div[@id='scholarship_intro_859'])]")
    for i in range(len(titleDivs)):
        title = titleDivs[i].get_attribute('textContent')
        requirements = ''
        sourceWebsite = ''
        description = ''
        # Skip the navigation/about headings -- they are not scholarships.
        if title != 'Quick Links' and title != 'About Us':
            if i == 0:
                # The first scholarship lives inside the intro div.
                description = self.driver.find_element_by_xpath("//div[@class='intro']/p").get_attribute(
                    'textContent')
                sourceWebsite = self.driver.find_element_by_xpath("//div[@class='intro']/p/a").get_attribute('href')
                requirements = self.driver.find_element_by_xpath(
                    "//div[@class='intro']/following-sibling::*[1][self::ul]").get_attribute('textContent')
            else:
                # XPath positions are 1-based, so heading index i maps to position i + 1.
                j = i + 1
                if self.checkIfElementExists(
                        "//h3[not(ancestor::div[@id='scholarship_intro_859'])][%s]/following-sibling::p[1]" % j):
                    description = self.driver.find_element_by_xpath(
                        "//h3[not(ancestor::div[@id='scholarship_intro_859'])][%s]/following-sibling::p[1]" % j).get_attribute(
                        'textContent')
                # A second <p> immediately following the first one holds the requirements.
                if self.checkIfElementExists(
                        "//h3[not(ancestor::div[@id='scholarship_intro_859'])][%s]/following-sibling::p[2][(preceding-sibling::*[1][self::p])]" % j):
                    requirements = self.driver.find_element_by_xpath(
                        "//h3[not(ancestor::div[@id='scholarship_intro_859'])][%s]/following-sibling::p[2][(preceding-sibling::*[1][self::p])]" % j).get_attribute(
                        'textContent')
                # Prefer a link in the first paragraph; otherwise fall back to one
                # inside the requirements paragraph.
                if self.checkIfElementExists(
                        "//h3[not(ancestor::div[@id='scholarship_intro_859'])][%s]/following-sibling::p[1]/a" % j):
                    sourceWebsite = self.driver.find_element_by_xpath(
                        "//h3[not(ancestor::div[@id='scholarship_intro_859'])][%s]/following-sibling::p[1]/a" % j).get_attribute(
                        'href')
                elif self.checkIfElementExists(
                        "//h3[not(ancestor::div[@id='scholarship_intro_859'])][%s]/following-sibling::p[2][(preceding-sibling::*[1][self::p])]" % j):
                    if self.checkIfElementExists(
                            "//h3[not(ancestor::div[@id='scholarship_intro_859'])][%s]/following-sibling::p[2][(preceding-sibling::*[1][self::p])]/a" % j):
                        sourceWebsite = self.driver.find_element_by_xpath(
                            "//h3[not(ancestor::div[@id='scholarship_intro_859'])][%s]/following-sibling::p[2][(preceding-sibling::*[1][self::p])]/a" % j).get_attribute(
                            'href')
            # NOTE(review): sourceWebsite may still be '' here, so this rips an
            # empty URL in that case -- confirm RipPage tolerates that.
            sourceText = RipPage.getPageSource(sourceWebsite)
            title = CleanText.cleanALLtheText(title)
            description = CleanText.cleanALLtheText(description)
            requirements = CleanText.cleanALLtheText(requirements)
            sourceText = CleanText.cleanALLtheText(sourceText)
            leadArray = [title, description, requirements, sourceWebsite, sourceText]
            self.teacherDotOrgLeadArrays.append(leadArray)
    self.driver.close()
    return self.teacherDotOrgLeadArrays
def getResultPageInfo(self):
    """Scrape all labeled sections of the current result page.

    Returns [url, sponsor, awardAmount, recipients, requirements,
    additionalInfo, contact, address, deadlineInformation, sourceWebsite,
    sourceText]; sections missing from the page come back as ''.
    """
    url = self.driver.current_url

    def grab(xpath):
        # Cleaned text content at `xpath`, or '' when the node is absent.
        if self.checkIfElementExists(xpath):
            return CleanText.cleanALLtheText(
                self.driver.find_element_by_xpath(xpath).get_attribute('textContent'))
        return ''

    sponsor = grab("//div/p/strong[text() = 'Awarded By']/../../following-sibling::div/p")
    # The sponsor cell carries a trailing "» More Info" link label; strip it.
    sponsor = re.sub('» More Info', '', sponsor)
    awardAmount = grab("//div/p/strong[text() = 'Award Amount']/../../following-sibling::div/p")
    recipients = grab("//div/p/strong[text() = 'Recipients']/../../following-sibling::div/p")
    requirements = grab("//div/p/strong[text() = 'Requirements']/../../following-sibling::div")
    additionalInfo = grab("//div/p/strong[text() = 'Additional Information']/../../following-sibling::div/p")
    contact = grab("//div/p/strong[text() = 'Contact']/../../following-sibling::div/p")
    address = grab("//div/p/strong[text() = 'Address']/../../following-sibling::div")
    deadlineInformation = grab("//strong[text() ='Deadline Information']/following-sibling::span[@class='smalltext']")

    sourceWebsite = ''
    sourceText = ''
    if self.checkIfElementExists("//a[@class='button cta']"):
        sourceWebsite = self.driver.find_element_by_xpath("//a[@class='button cta']").get_attribute('href')
        sourceText = CleanText.cleanALLtheText(RipPage.getPageSource(sourceWebsite))

    return [url, sponsor, awardAmount, recipients, requirements, additionalInfo,
            contact, address, deadlineInformation, sourceWebsite, sourceText]
def makeLeadArray(self, resultArray):
    """Append one Masters-in-Education lead row, ripping the source site's text."""
    title, amount, deadline, description, sourceWebsite = resultArray[:5]
    sourceText = CleanText.cleanALLtheText(RipPage.getPageSource(sourceWebsite))
    self.mastersInEducationLeadsArrays.append(
        [title, amount, deadline, description, sourceWebsite, sourceText])
def makeLeadArray(self, resultArray):
    """Build a [title, amount, deadline, description, sourceWebsite, sourceText] row and store it."""
    sourceWebsite = resultArray[4]
    rippedSource = RipPage.getPageSource(sourceWebsite)
    cleanedSource = CleanText.cleanALLtheText(rippedSource)
    lead = [resultArray[0], resultArray[1], resultArray[2], resultArray[3],
            sourceWebsite, cleanedSource]
    self.mastersInEducationLeadsArrays.append(lead)
def __init__(self):
    """Backfill LinkBody for every dbo.LinkCrawlerHrefs row that is missing one.

    Rips each stored LinkUrl, cleans the text, and writes it back to the row.
    """
    self.db = SUDBConnect()
    self.linkUrlsList = []
    rowsWithEmptyLinkBody = self.db.getRowsDB(
        "select * from dbo.LinkCrawlerHrefs where ISNULL(LinkBody, '') = ''")
    # Iterating an empty result set is a no-op, so no length guards are needed.
    for row in rowsWithEmptyLinkBody:
        self.linkUrlsList.append(row.LinkUrl)
    for link in self.linkUrlsList:
        linkbody = RipPage.getPageSource(link)
        cleanLinkBody = CleanText.cleanALLtheText(linkbody)
        # Double any single quotes so ripped text or URLs cannot break (or
        # inject into) the concatenated T-SQL; the DB helper only accepts raw
        # SQL strings, so literal escaping is the available defense here.
        safeBody = cleanLinkBody.replace("'", "''")
        safeLink = link.replace("'", "''")
        self.db.insertUpdateOrDeleteDB(
            "UPDATE dbo.LinkCrawlerHrefs SET LinkBody='" + safeBody +
            "' WHERE LinkUrl='" + safeLink + "'")
def __init__(self):
    """Backfill LinkBody (and stamp DateBodyGenerated) for dbo.GoogleLeads rows missing a body."""
    self.db = SUDBConnect()
    self.listOfEmptyLinkBodyLinks = []
    rowsWithEmptyLinkBody = self.db.getRowsDB(
        "select * from dbo.GoogleLeads where ISNULL(LinkBody, '') = ''")
    # Iterating an empty result set is a no-op, so no length guards are needed.
    for row in rowsWithEmptyLinkBody:
        self.listOfEmptyLinkBodyLinks.append(row.Link)
    for link in self.listOfEmptyLinkBodyLinks:
        linkbody = RipPage.getPageSource(link)
        linkbody = CleanText.cleanALLtheText(linkbody)
        # Double any single quotes so ripped text or URLs cannot break (or
        # inject into) the concatenated T-SQL statement.
        safeBody = linkbody.replace("'", "''")
        safeLink = link.replace("'", "''")
        self.db.insertUpdateOrDeleteDB(
            "update dbo.GoogleLeads set LinkBody='" + safeBody +
            "', DateBodyGenerated=GETDATE() where Link='" + safeLink + "'")
def getInfoFromScholarshipPage(self, url):
    """Open a scholarship detail page and extract its labeled fields.

    Returns [description, awardType, numAwards, majors, additionalInfo,
    sourceWebsite, sourceText]; absent fields come back as ''.
    """
    self.driver.get(url)
    self.driver.implicitly_wait(2)

    def cleanedTextAt(xpath):
        # Cleaned text content at `xpath`, or '' when the node is absent.
        if not self.checkIfElementExists(xpath):
            return ''
        raw = self.driver.find_element_by_xpath(xpath).get_attribute('textContent')
        return CleanText.cleanALLtheText(raw)

    description = cleanedTextAt("//div[@class='description']")
    awardType = cleanedTextAt("//p[text() = 'Award Type: ']/following-sibling::p[@class='data']")
    numAwards = cleanedTextAt("//p[text() = 'Awards Available: ']/following-sibling::p[@class='data']")

    majors = ''
    if self.checkIfElementExists("//p[text() = 'Fields of Study: ']/following-sibling::p[@class='data major']"):
        majors = self.driver.find_element_by_xpath(
            "//p[text() = 'Fields of Study: ']/following-sibling::p[@class='data major']").get_attribute(
            'textContent')
        # "All Fields of Study" is the site's placeholder for "unrestricted" -- drop it.
        majors = CleanText.cleanALLtheText(re.sub('All Fields of Study', '', majors))

    additionalInfo = cleanedTextAt("//p[text() = 'Additional Info: ']/following-sibling::p[@class='data major']")

    sourceWebsite = ''
    sourceText = ''
    if self.checkIfElementExists("//p[text() = 'Website: ']/following-sibling::p[@class='data']/a"):
        sourceWebsite = self.driver.find_element_by_xpath(
            "//p[text() = 'Website: ']/following-sibling::p[@class='data']/a").get_attribute('href')
        sourceText = CleanText.cleanALLtheText(RipPage.getPageSource(sourceWebsite))

    return [description, awardType, numAwards, majors, additionalInfo, sourceWebsite, sourceText]
def getInfoFromScholarshipPage(self, link):
    """Extract fields from a scholarship page, or return None for 'hosted' pages.

    Pages showing the hosted-scholarship apply/save-profile button carry no
    scrapeable detail and yield None. Otherwise returns [eligibility,
    applicationOverview, description, sponsor, sourceText, sourceWebsite],
    with the first five entries run through CleanText.
    """
    self.driver.get(link)
    self.driver.implicitly_wait(2)
    badButtons = self.driver.find_elements_by_xpath(
        "//button[@class='btn-primary-sm save-profile chgsec_hostedsc-apply-ApplyNowButton chgser_sc']")
    if badButtons != []:
        return None

    def grab(xpath):
        # Raw text content at `xpath`, or '' when the node is absent.
        if self.checkIfElementExists(xpath):
            return self.driver.find_element_by_xpath(xpath).get_attribute('textContent')
        return ''

    eligibility = grab("//span[@class='txt-3']")
    applicationOverview = grab("//h3[text() = 'Application Overview']/following-sibling::div[@class='txt-3']")
    description = grab("//h3[text() = 'Purpose']/following-sibling::div[@class='txt-3']")
    sponsor = grab("//h3[text() = 'Provider Organization']/following-sibling::div[@class='txt-3'][1]")

    sourceWebsite = ''
    sourceText = ''
    if self.checkIfElementExists("//button[@class='btn-primary-sm go-apply']"):
        # NOTE(review): 'url' is not a standard attribute for <button>; this
        # looks site-specific -- confirm against the live page.
        sourceWebsite = self.driver.find_element_by_xpath(
            "//button[@class='btn-primary-sm go-apply']").get_attribute('url')
        sourceText = RipPage.getPageSource(sourceWebsite)

    resultPageInfoArray = [CleanText.cleanALLtheText(item) for item in
                           (eligibility, applicationOverview, description, sponsor, sourceText)]
    resultPageInfoArray.append(sourceWebsite)
    return resultPageInfoArray
def getInfoFromScholarshipPage(self, url):
    """Scrape description, eligibility, amount, deadline, and the external source from a scholarship page."""
    self.driver.get(url)
    self.driver.implicitly_wait(2)

    def cleanedTextAt(xpath):
        # Cleaned text content at `xpath`, or '' when the node is absent.
        if not self.checkIfElementExists(xpath):
            return ''
        raw = self.driver.find_element_by_xpath(xpath).get_attribute('textContent')
        return CleanText.cleanALLtheText(raw)

    description = cleanedTextAt("//div[@class='entry-content']/p[1]")
    eligibility = cleanedTextAt(
        "//div[@class='entry-content']/p/strong[text() = 'Who is eligible to apply?']/../following-sibling::ul[1]")
    amountInfo = cleanedTextAt(
        "//div[@class='entry-content']/p/strong[text() = 'How much is each scholarship worth?']/../following-sibling::p[1]")
    deadlineInfo = cleanedTextAt(
        "//div[@class='entry-content']/p/strong[text() = 'When is the deadline to apply?']/../following-sibling::ul[1]")

    sourceWebsite = ''
    sourceText = ''
    if self.checkIfElementExists("//span[@class='apply']/a"):
        sourceWebsite = self.driver.find_element_by_xpath("//span[@class='apply']/a").get_attribute('href')
        sourceText = CleanText.cleanALLtheText(RipPage.getPageSource(sourceWebsite))

    return [description, eligibility, amountInfo, deadlineInfo, sourceWebsite, sourceText]
def getLeads(self):
    """Collect [title, description, dueDate, link, sourceText] rows from the Fatomei listing."""
    titleAnchors = self.driver.find_elements_by_xpath(
        "//td[@class='f']/../preceding-sibling::tr[1]/td[@class='a']/a")
    dateDescriptionCells = self.driver.find_elements_by_xpath("//tr/td[@class='f']/../td")
    titles = self.getTitlesList(titleAnchors)
    links = self.getLinksList(titleAnchors)
    dueDates = self.getDueDates(dateDescriptionCells)
    descriptions = self.getDescriptionsList(dateDescriptionCells)
    for i, rawTitle in enumerate(titles):
        link = links[i]
        row = [CleanText.cleanALLtheText(rawTitle),
               CleanText.cleanALLtheText(descriptions[i]),
               dueDates[i],
               link,
               CleanText.cleanALLtheText(RipPage.getPageSource(link))]
        self.fatomeiLeadsArray.append(row)
def getSourceWebsitesAndSourceTexts(self):
    """Gather every external source link on the page plus the cleaned text of each target page."""
    anchorElements = self.driver.find_elements_by_xpath(
        "//div[@class='col-xs-8 col-xs-offset-2']/a")
    websiteList = []
    for element in anchorElements:
        websiteList.append(element.get_attribute('href'))
    textList = []
    for site in websiteList:
        textList.append(RipPage.getPageSource(site))
    cleanedTextList = []
    for pageText in textList:
        cleanedTextList.append(CleanText.cleanALLtheText(pageText))
    return websiteList, cleanedTextList
def getLeads(self):
    """Build Fatomei scholarship rows and append them to self.fatomeiLeadsArray."""
    titleLinks = self.driver.find_elements_by_xpath(
        "//td[@class='f']/../preceding-sibling::tr[1]/td[@class='a']/a")
    infoCells = self.driver.find_elements_by_xpath("//tr/td[@class='f']/../td")
    titlesList = self.getTitlesList(titleLinks)
    linksList = self.getLinksList(titleLinks)
    dueDatesList = self.getDueDates(infoCells)
    descriptionsList = self.getDescriptionsList(infoCells)
    for index in range(len(titlesList)):
        cleanTitle = CleanText.cleanALLtheText(titlesList[index])
        pageLink = linksList[index]
        cleanDescription = CleanText.cleanALLtheText(descriptionsList[index])
        cleanSource = CleanText.cleanALLtheText(RipPage.getPageSource(pageLink))
        self.fatomeiLeadsArray.append(
            [cleanTitle, cleanDescription, dueDatesList[index], pageLink, cleanSource])
def makeLeadArrayAndAddToGrantForwardLeads(self, singleResultArray):
    """Turn one (name, url) search result into a full lead row and store it."""
    name = CleanText.cleanALLtheText(singleResultArray[0])
    url = singleResultArray[1]
    pageInfo = self.goToResultPageAndPullInformation(url)
    keyword = CleanText.cleanALLtheText(self.searchTerm)
    # pageInfo indexing per this method: 0=sourceWebsite, 1=sponsor, 2=amount,
    # 3=applicantType, 4=citizenshipResidency, 5=activityLocation, 6=abstract,
    # 7=eligibility, 8=categories.
    abstract = CleanText.cleanALLtheText(pageInfo[6])
    sponsor = CleanText.cleanALLtheText(pageInfo[1])
    amount = CleanText.cleanALLtheText(pageInfo[2])
    applicantType = CleanText.cleanALLtheText(pageInfo[3])
    citizenshipResidency = CleanText.cleanALLtheText(pageInfo[4])
    activityLocation = CleanText.cleanALLtheText(pageInfo[5])
    eligibility = CleanText.cleanALLtheText(pageInfo[7])
    categories = CleanText.cleanALLtheText(pageInfo[8])
    sourceWebsite = pageInfo[0]
    sourceText = CleanText.cleanALLtheText(RipPage.getPageSource(sourceWebsite))
    lead = [keyword, url, name, abstract, sponsor, amount, applicantType,
            citizenshipResidency, activityLocation, eligibility, categories,
            sourceWebsite, sourceText]
    self.arrayOfPivotLeads.append(lead)
def getLeads(self):
    """Scrape the CollegeGreenlight results table into lead arrays and return them."""
    nameAnchors = self.driver.find_elements_by_xpath("//td[@class='scholarshipNameColumn']/div/a")
    amountCells = self.driver.find_elements_by_xpath("//td[@class='amount']")
    deadlineCells = self.driver.find_elements_by_xpath("//td[@class='deadline']")
    titles = self.getTitlesList(nameAnchors)
    links = self.getLinksList(nameAnchors)
    amounts = self.getAmountsList(amountCells)
    deadlines = self.getDeadlinesList(deadlineCells)
    for idx in range(len(titles)):
        pageLink = links[idx]
        pageInfo = self.goToResultPageAndPullInformation(pageLink)
        externalSite = pageInfo[1]
        # Only rip the external site when it is an absolute http(s) link.
        externalText = ''
        if re.search('^https?://', externalSite):
            externalText = CleanText.cleanALLtheText(RipPage.getPageSource(externalSite))
        self.collegeGreenLightLeadsArrays.append([
            CleanText.cleanALLtheText(titles[idx]),
            CleanText.cleanALLtheText(amounts[idx]),
            deadlines[idx],
            CleanText.cleanALLtheText(pageInfo[0]),
            CleanText.cleanALLtheText(pageInfo[2]),
            CleanText.cleanALLtheText(pageInfo[3]),
            pageLink,
            externalSite,
            externalText,
        ])
    self.driver.quit()
    return self.collegeGreenLightLeadsArrays
def goToResultsPageAndGetInfo(self, resultPageLink):
    """Open a scholarship result page (logging in if needed) and scrape its fields.

    Returns a 13-item list: the first 12 entries ([sponsor, submissionDeadline,
    majors, awardAmount, description, otherCriteria, numberAwards,
    hostInstitution, awardIncludes, nationalityRequired, hostCountries,
    sourceText]) are run through CleanText, then the raw sourceWebsite URL is
    appended last. Fields missing from the page come back as ''.
    """
    self.driver.get(resultPageLink)
    self.driver.implicitly_wait(2)
    self.makeSureLoggedIn(resultPageLink)
    sponsor = ''
    submissionDeadline = ''
    majors = ''
    awardAmount = ''
    description = ''
    otherCriteria = ''
    numberAwards = ''
    hostInstitution = ''
    awardIncludes = ''
    nationalityRequired = ''
    hostCountries = ''
    sourceWebsite = ''
    sourceText = ''
    if self.checkIfElementExists("//span[@class='award-sponsor']"):
        sponsor = self.driver.find_element_by_xpath("//span[@class='award-sponsor']").get_attribute('textContent')
        # Drop the leading "Sponsor:" label the page prepends.
        sponsor = re.sub('^Sponsor:', '', sponsor)
    if self.checkIfElementExists("//h4[text() = 'Submission Deadline']/following-sibling::p"):
        submissionDeadline = self.driver.find_element_by_xpath(
            "//h4[text() = 'Submission Deadline']/following-sibling::p").get_attribute('textContent')
    if self.checkIfElementExists("//p[@id='award-fieldofstudy']"):
        majors = self.driver.find_element_by_xpath("//p[@id='award-fieldofstudy']").get_attribute('textContent')
        # "Unrestricted" is the site's placeholder for "no restriction" -- drop it.
        majors = re.sub('Unrestricted', '', majors)
    if self.checkIfElementExists("//p[@id='award-amount']"):
        awardAmount = self.driver.find_element_by_xpath("//p[@id='award-amount']").get_attribute('textContent')
    if self.checkIfElementExists(
            "//div[@class='award-description padding_bottom_30']/h4[text() = 'Description']/following-sibling::p[1]"):
        description = self.driver.find_element_by_xpath(
            "//div[@class='award-description padding_bottom_30']/h4[text() = 'Description']/following-sibling::p[1]").get_attribute(
            'textContent')
    if self.checkIfElementExists(
            "//div[@class='award-description padding_bottom_30']/h4[text() = 'Other Criteria']/following-sibling::p[1]"):
        otherCriteria = self.driver.find_element_by_xpath(
            "//div[@class='award-description padding_bottom_30']/h4[text() = 'Other Criteria']/following-sibling::p[1]").get_attribute(
            'textContent')
    if self.checkIfElementExists("//th[text() = 'Number of Awards']/following-sibling::td"):
        numberAwards = self.driver.find_element_by_xpath(
            "//th[text() = 'Number of Awards']/following-sibling::td").get_attribute('textContent')
    if self.checkIfElementExists("//th[text() = 'Host Institution']/following-sibling::td"):
        hostInstitution = self.driver.find_element_by_xpath(
            "//th[text() = 'Host Institution']/following-sibling::td").get_attribute('textContent')
    if self.checkIfElementExists("//th[text() = 'Includes']/following-sibling::td"):
        awardIncludes = self.driver.find_element_by_xpath(
            "//th[text() = 'Includes']/following-sibling::td").get_attribute('textContent')
    if self.checkIfElementExists("//th[text() = 'Nationality Required']/following-sibling::td"):
        nationalityRequired = self.driver.find_element_by_xpath(
            "//th[text() = 'Nationality Required']/following-sibling::td").get_attribute('textContent')
        nationalityRequired = re.sub('Unrestricted', '', nationalityRequired)
    if self.checkIfElementExists("//th[text() = 'Host Countries']/following-sibling::td"):
        hostCountries = self.driver.find_element_by_xpath(
            "//th[text() = 'Host Countries']/following-sibling::td").get_attribute('textContent')
        hostCountries = re.sub('Unrestricted', '', hostCountries)
    if self.checkIfElementExists("//th[text() = 'Link']/following-sibling::td/a"):
        sourceWebsite = self.driver.find_element_by_xpath(
            "//th[text() = 'Link']/following-sibling::td/a").get_attribute('href')
        sourceText = RipPage.getPageSource(sourceWebsite)
    resultPageInfoArray = [sponsor, submissionDeadline, majors, awardAmount, description, otherCriteria,
                           numberAwards, hostInstitution, awardIncludes, nationalityRequired, hostCountries,
                           sourceText]
    # Clean every text field in one pass, then tack the raw URL on the end.
    resultPageInfoArray = [CleanText.cleanALLtheText(item) for item in resultPageInfoArray]
    resultPageInfoArray.append(sourceWebsite)
    return resultPageInfoArray
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import codecs

from Classes.RipPage import RipPage
from Classes.CleanText import CleanText

# Debug script: rip an ACS award detail page and inspect how utf-8 byte
# encoding and CleanText transform the ripped text.
# rippedpage = RipPage.getPageSource('http://webapps.acs.org/findawards/detail.jsp?ContentId=CTP_004520')
rippedSource = RipPage.getPageSource('http://webapps.acs.org/findawards/detail.jsp?ContentId=CNBP_031057')
encodedBytes = bytearray(rippedSource, "utf-8")
print(str(encodedBytes))
byteReprText = str(encodedBytes)
print(byteReprText)
print(CleanText.cleanALLtheText(byteReprText))
from Classes.RipPage import RipPage

# Smoke test: fetch a page (result intentionally discarded), then exercise
# simple string concatenation and printing.
RipPage.getPageSource('http://engineering.berkeley.edu/')

snackOne = 'cheese'
snackTwo = 'crackers'
combinedSnacks = 'cheese' + snackTwo
print(combinedSnacks)
def goToResultPageAndPullInformation(self, resultPageLink):
    """Open a scholarship result page and scrape its detail fields.

    Navigates the driver to *resultPageLink*, then pulls each field from
    its known xpath, cleaning text through CleanText. Any field whose
    element is absent stays ''. When a source link exists, its target
    page is also ripped and cleaned.

    :param resultPageLink: URL of the result page to scrape.
    :return: list [description, sponsor, amount, eligibility,
             submissionInfo, categories, sourceWebsite, sourceText,
             deadline]
    """
    self.driver.get(resultPageLink)
    self.driver.implicitly_wait(2)

    def cleanedText(xpath):
        # Cleaned textContent of the element at *xpath*, or '' if absent.
        # Replaces eight hand-copied exists/find/clean stanzas.
        if self.checkIfElementExists(xpath):
            return CleanText.cleanALLtheText(
                self.driver.find_element_by_xpath(xpath).get_attribute('textContent'))
        return ''

    description = cleanedText("//div[@id = 'field-description']/div[@class = 'content-collapsed']")
    sponsor = cleanedText("//div[@class = 'sponsor-content']/div/a")
    amount = cleanedText("//div[@id = 'field-amount_info']/div[@class = 'content-collapsed']")
    eligibility = cleanedText("//div[@id = 'field-eligibility']/div[@class = 'content-collapsed']")
    submissionInfo = cleanedText("//div[@id = 'field-submission_info']/div[@class = 'content-collapsed']")
    categories = cleanedText("//div[@id = 'field-subjects']/ul")

    sourceWebsite = ''
    sourceText = ''
    if self.checkIfElementExists("//a[@class = 'source-link btn btn-warning']"):
        sourceWebsite = self.driver.find_element_by_xpath(
            "//a[@class = 'source-link btn btn-warning']").get_attribute('href')
        # Source page is fetched only when a link button is present.
        sourceText = CleanText.cleanALLtheText(RipPage.getPageSource(sourceWebsite))

    deadline = cleanedText("//div[@class='table-responsive deadline-tables']/table/tbody")

    return [description, sponsor, amount, eligibility, submissionInfo,
            categories, sourceWebsite, sourceText, deadline]
def getInfoFromResultPage(self, resultPageLink):
    """Scrape one scholarship result page into a flat list of fields.

    Navigates to *resultPageLink* and extracts the description, sponsor,
    criteria-table rows, deadline, essay info and source site. Missing
    elements leave their field as ''.

    Fixes vs. the original:
    - the 'major' field reused the 'School Year' xpath (copy-paste bug),
      so it always duplicated classStatus; it now reads the 'Major' row.
      NOTE(review): confirm the row label is exactly 'Major' on the site.
    - the deadline regex was '\\(\\.*?\\)', which only matched literal
      dots inside parens; r'\\(.*?\\)' strips any parenthesized note.

    :param resultPageLink: URL of the result page.
    :return: list [description, sponsor, classStatus, major, gender,
             ethnicity, grades, testScores, geography, deadline,
             essayInfo, sourceWebsite, sourceText]
    """
    self.driver.get(resultPageLink)
    self.driver.implicitly_wait(2)

    def cleanedText(xpath):
        # Cleaned textContent at *xpath*, '' when the element is missing.
        if self.checkIfElementExists(xpath):
            return CleanText.cleanALLtheText(
                self.driver.find_element_by_xpath(xpath).get_attribute('textContent'))
        return ''

    def restrictionCell(label):
        # Criteria-table cell for *label*, with the site's
        # "No Restrictions (Listed)" placeholders blanked out.
        text = cleanedText("//tr/td[contains(text(), '%s')]/following-sibling::td" % label)
        text = re.sub('No Restrictions Listed', '', text)
        return re.sub('No Restrictions', '', text)

    description = cleanedText("//div[@id='main-column']/p[1]")

    sponsor = ''
    if self.checkIfElementExists("//div[@id='main-column']/p[2]"):
        sponsor = self.driver.find_element_by_xpath(
            "//div[@id='main-column']/p[2]").get_attribute('textContent')
        # Strip the 'Sponsor:' prefix before cleaning, as the original did.
        sponsor = CleanText.cleanALLtheText(re.sub('Sponsor:', '', sponsor))

    classStatus = restrictionCell('School Year')
    major = restrictionCell('Major')  # was 'School Year' — copy-paste bug
    gender = restrictionCell('Gender')
    ethnicity = restrictionCell('Ethnicity')
    grades = restrictionCell('Grades')
    testScores = restrictionCell('Test Scores')
    geography = restrictionCell('Geography')

    deadline = ''
    if self.checkIfElementExists("//span[@class='deadline data']"):
        deadline = self.driver.find_element_by_xpath(
            "//span[@class='deadline data']").get_attribute('textContent')
        deadline = CleanText.cleanALLtheText(re.sub(r'\(.*?\)', '', deadline))

    essayInfo = ''
    if self.checkIfElementExists(
            "//div[@class='listing-info']/h3[contains(text(), 'Essay')]/following-sibling::p"):
        essayPart1 = self.driver.find_elements_by_xpath(
            "//div[@class='listing-info']/h3[contains(text(), 'Essay')]/following-sibling::p")
        # Pair each essay paragraph with its length note only when the
        # counts line up; otherwise join the paragraphs alone.
        if self.checkIfElementExists("//div[@id='essay-length']") and len(
                self.driver.find_elements_by_xpath("//div[@id='essay-length']")) == len(essayPart1):
            essayPart2 = self.driver.find_elements_by_xpath("//div[@id='essay-length']")
            combinedParts = []
            for partEl, lengthEl in zip(essayPart1, essayPart2):
                part1 = CleanText.cleanALLtheText(partEl.get_attribute('textContent'))
                part2 = CleanText.cleanALLtheText(lengthEl.get_attribute('textContent'))
                combinedParts.append('%s %s' % (part1, part2))
            essayInfo = ' '.join(combinedParts)
        else:
            essayInfo = ' '.join(
                CleanText.cleanALLtheText(essayPart.get_attribute('textContent'))
                for essayPart in essayPart1)

    sourceWebsite = ''
    sourceText = ''
    if self.checkIfElementExists("//a[@class='action-button visit-site']"):
        sourceWebsite = self.driver.find_element_by_xpath(
            "//a[@class='action-button visit-site']").get_attribute('href')
        sourceText = CleanText.cleanALLtheText(RipPage.getPageSource(sourceWebsite))

    return [description, sponsor, classStatus, major, gender, ethnicity,
            grades, testScores, geography, deadline, essayInfo,
            sourceWebsite, sourceText]
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Scratch script (duplicate of the sibling ACS ripper): fetch an ACS award
detail page, show its UTF-8 bytearray repr, and run that repr through
CleanText."""
import codecs
from Classes.RipPage import RipPage
from Classes.CleanText import CleanText

# rippedpage = RipPage.getPageSource('http://webapps.acs.org/findawards/detail.jsp?ContentId=CTP_004520')
html = RipPage.getPageSource(
    'http://webapps.acs.org/findawards/detail.jsp?ContentId=CNBP_031057')

raw_bytes = bytearray(html, "utf-8")
print(str(raw_bytes))

repr_text = str(raw_bytes)
print(repr_text)
print(CleanText.cleanALLtheText(repr_text))
def getLeads(self):
    """Scrape scholarship leads from the teachers.org-style page.

    Walks every <h3> outside the intro container; the first lead comes
    from the intro block, later ones from the paragraphs following each
    heading. 'Quick Links' / 'About Us' headings are skipped.

    Fix vs. original: the source page was fetched unconditionally, so an
    empty sourceWebsite triggered RipPage.getPageSource('') — the fetch
    now only happens when a link was actually found (matching the
    sibling scrapers' guard).

    :return: self.teacherDotOrgLeadArrays, a list of
             [title, description, requirements, sourceWebsite, sourceText]
    """
    h3 = "//h3[not(ancestor::div[@id='scholarship_intro_859'])]"
    titleDivs = self.driver.find_elements_by_xpath(h3)
    for i in range(len(titleDivs)):
        title = titleDivs[i].get_attribute('textContent')
        requirements = ''
        sourceWebsite = ''
        description = ''
        if title != 'Quick Links' and title != 'About Us':
            if i == 0:
                # First heading: fields live in the dedicated intro block.
                description = self.driver.find_element_by_xpath(
                    "//div[@class='intro']/p").get_attribute('textContent')
                sourceWebsite = self.driver.find_element_by_xpath(
                    "//div[@class='intro']/p/a").get_attribute('href')
                requirements = self.driver.find_element_by_xpath(
                    "//div[@class='intro']/following-sibling::*[1][self::ul]"
                ).get_attribute('textContent')
            else:
                # XPath positions are 1-based and the intro h3 is excluded,
                # hence heading i maps to position i + 1.
                j = i + 1
                if self.checkIfElementExists(h3 + "[%s]/following-sibling::p[1]" % j):
                    description = self.driver.find_element_by_xpath(
                        h3 + "[%s]/following-sibling::p[1]" % j).get_attribute('textContent')
                if self.checkIfElementExists(
                        h3 + "[%s]/following-sibling::p[2][(preceding-sibling::*[1][self::p])]" % j):
                    requirements = self.driver.find_element_by_xpath(
                        h3 + "[%s]/following-sibling::p[2][(preceding-sibling::*[1][self::p])]" % j
                    ).get_attribute('textContent')
                # Link preferably comes from the first paragraph, falling
                # back to the second when only it carries an anchor.
                if self.checkIfElementExists(h3 + "[%s]/following-sibling::p[1]/a" % j):
                    sourceWebsite = self.driver.find_element_by_xpath(
                        h3 + "[%s]/following-sibling::p[1]/a" % j).get_attribute('href')
                elif self.checkIfElementExists(
                        h3 + "[%s]/following-sibling::p[2][(preceding-sibling::*[1][self::p])]" % j):
                    if self.checkIfElementExists(
                            h3 + "[%s]/following-sibling::p[2][(preceding-sibling::*[1][self::p])]/a" % j):
                        sourceWebsite = self.driver.find_element_by_xpath(
                            h3 + "[%s]/following-sibling::p[2][(preceding-sibling::*[1][self::p])]/a" % j
                        ).get_attribute('href')
            # Only rip the source site when a link was found (bug fix).
            sourceText = RipPage.getPageSource(sourceWebsite) if sourceWebsite else ''
            title = CleanText.cleanALLtheText(title)
            description = CleanText.cleanALLtheText(description)
            requirements = CleanText.cleanALLtheText(requirements)
            sourceText = CleanText.cleanALLtheText(sourceText)
            leadArray = [title, description, requirements, sourceWebsite, sourceText]
            self.teacherDotOrgLeadArrays.append(leadArray)
    self.driver.close()
    return self.teacherDotOrgLeadArrays
def test_RipPage(self):
    """RipPage.getPageSource should return a non-trivial string for a live URL."""
    page_source = RipPage.getPageSource('https://www.google.com/')
    self.assertIsNotNone(page_source)
    # A real HTML document is comfortably longer than 10 characters.
    self.assertGreater(len(page_source), 10)
def getInfoFromResultPage(self, resultPageLink):
    """Scrape one scholarship result page into a flat list of fields.

    Navigates to *resultPageLink* and extracts the description, sponsor,
    criteria-table rows, deadline, essay info and source site. Missing
    elements leave their field as ''.

    Fixes vs. the original:
    - the 'major' field reused the 'School Year' xpath (copy-paste bug),
      so it always duplicated classStatus; it now reads the 'Major' row.
      NOTE(review): confirm the row label is exactly 'Major' on the site.
    - the deadline regex was '\\(\\.*?\\)', which only matched literal
      dots inside parens; r'\\(.*?\\)' strips any parenthesized note.

    :param resultPageLink: URL of the result page.
    :return: list [description, sponsor, classStatus, major, gender,
             ethnicity, grades, testScores, geography, deadline,
             essayInfo, sourceWebsite, sourceText]
    """
    self.driver.get(resultPageLink)
    self.driver.implicitly_wait(2)

    def cleanedText(xpath):
        # Cleaned textContent at *xpath*, '' when the element is missing.
        if self.checkIfElementExists(xpath):
            return CleanText.cleanALLtheText(
                self.driver.find_element_by_xpath(xpath).get_attribute('textContent'))
        return ''

    def restrictionCell(label):
        # Criteria-table cell for *label*, with the site's
        # "No Restrictions (Listed)" placeholders blanked out.
        text = cleanedText("//tr/td[contains(text(), '%s')]/following-sibling::td" % label)
        text = re.sub('No Restrictions Listed', '', text)
        return re.sub('No Restrictions', '', text)

    description = cleanedText("//div[@id='main-column']/p[1]")

    sponsor = ''
    if self.checkIfElementExists("//div[@id='main-column']/p[2]"):
        sponsor = self.driver.find_element_by_xpath(
            "//div[@id='main-column']/p[2]").get_attribute('textContent')
        # Strip the 'Sponsor:' prefix before cleaning, as the original did.
        sponsor = CleanText.cleanALLtheText(re.sub('Sponsor:', '', sponsor))

    classStatus = restrictionCell('School Year')
    major = restrictionCell('Major')  # was 'School Year' — copy-paste bug
    gender = restrictionCell('Gender')
    ethnicity = restrictionCell('Ethnicity')
    grades = restrictionCell('Grades')
    testScores = restrictionCell('Test Scores')
    geography = restrictionCell('Geography')

    deadline = ''
    if self.checkIfElementExists("//span[@class='deadline data']"):
        deadline = self.driver.find_element_by_xpath(
            "//span[@class='deadline data']").get_attribute('textContent')
        deadline = CleanText.cleanALLtheText(re.sub(r'\(.*?\)', '', deadline))

    essayInfo = ''
    if self.checkIfElementExists(
            "//div[@class='listing-info']/h3[contains(text(), 'Essay')]/following-sibling::p"):
        essayPart1 = self.driver.find_elements_by_xpath(
            "//div[@class='listing-info']/h3[contains(text(), 'Essay')]/following-sibling::p")
        # Pair each essay paragraph with its length note only when the
        # counts line up; otherwise join the paragraphs alone.
        if self.checkIfElementExists("//div[@id='essay-length']") and len(
                self.driver.find_elements_by_xpath("//div[@id='essay-length']")) == len(essayPart1):
            essayPart2 = self.driver.find_elements_by_xpath("//div[@id='essay-length']")
            combinedParts = []
            for partEl, lengthEl in zip(essayPart1, essayPart2):
                part1 = CleanText.cleanALLtheText(partEl.get_attribute('textContent'))
                part2 = CleanText.cleanALLtheText(lengthEl.get_attribute('textContent'))
                combinedParts.append('%s %s' % (part1, part2))
            essayInfo = ' '.join(combinedParts)
        else:
            essayInfo = ' '.join(
                CleanText.cleanALLtheText(essayPart.get_attribute('textContent'))
                for essayPart in essayPart1)

    sourceWebsite = ''
    sourceText = ''
    if self.checkIfElementExists("//a[@class='action-button visit-site']"):
        sourceWebsite = self.driver.find_element_by_xpath(
            "//a[@class='action-button visit-site']").get_attribute('href')
        sourceText = CleanText.cleanALLtheText(RipPage.getPageSource(sourceWebsite))

    return [description, sponsor, classStatus, major, gender, ethnicity,
            grades, testScores, geography, deadline, essayInfo,
            sourceWebsite, sourceText]
def goToResultsPageAndGetInfo(self, resultPageLink):
    """Scrape an award detail page (login-gated) into a cleaned field list.

    Navigates to *resultPageLink*, re-authenticates via makeSureLoggedIn,
    then reads each field from its xpath; absent elements leave ''.
    All fields except the final sourceWebsite are run through CleanText.

    :param resultPageLink: URL of the award detail page.
    :return: list [sponsor, submissionDeadline, majors, awardAmount,
             description, otherCriteria, numberAwards, hostInstitution,
             awardIncludes, nationalityRequired, hostCountries,
             sourceText, sourceWebsite] — sourceWebsite is appended last,
             uncleaned, matching the original ordering.
    """
    self.driver.get(resultPageLink)
    self.driver.implicitly_wait(2)
    self.makeSureLoggedIn(resultPageLink)

    def textAt(xpath):
        # Raw textContent at *xpath*, '' when the element is missing.
        # Replaces eleven hand-copied exists/find stanzas.
        if self.checkIfElementExists(xpath):
            return self.driver.find_element_by_xpath(xpath).get_attribute('textContent')
        return ''

    # re.sub on '' is a no-op, so the missing-element case still yields ''.
    sponsor = re.sub('^Sponsor:', '', textAt("//span[@class='award-sponsor']"))
    submissionDeadline = textAt("//h4[text() = 'Submission Deadline']/following-sibling::p")
    majors = re.sub('Unrestricted', '', textAt("//p[@id='award-fieldofstudy']"))
    awardAmount = textAt("//p[@id='award-amount']")
    description = textAt(
        "//div[@class='award-description padding_bottom_30']/h4[text() = 'Description']/following-sibling::p[1]")
    otherCriteria = textAt(
        "//div[@class='award-description padding_bottom_30']/h4[text() = 'Other Criteria']/following-sibling::p[1]")
    numberAwards = textAt("//th[text() = 'Number of Awards']/following-sibling::td")
    hostInstitution = textAt("//th[text() = 'Host Institution']/following-sibling::td")
    awardIncludes = textAt("//th[text() = 'Includes']/following-sibling::td")
    nationalityRequired = re.sub(
        'Unrestricted', '', textAt("//th[text() = 'Nationality Required']/following-sibling::td"))
    hostCountries = re.sub(
        'Unrestricted', '', textAt("//th[text() = 'Host Countries']/following-sibling::td"))

    sourceWebsite = ''
    sourceText = ''
    if self.checkIfElementExists("//th[text() = 'Link']/following-sibling::td/a"):
        sourceWebsite = self.driver.find_element_by_xpath(
            "//th[text() = 'Link']/following-sibling::td/a").get_attribute('href')
        sourceText = RipPage.getPageSource(sourceWebsite)

    resultPageInfoArray = [
        sponsor, submissionDeadline, majors, awardAmount, description,
        otherCriteria, numberAwards, hostInstitution, awardIncludes,
        nationalityRequired, hostCountries, sourceText
    ]
    resultPageInfoArray = [CleanText.cleanALLtheText(item) for item in resultPageInfoArray]
    # URL is appended after cleaning so it keeps its exact original form.
    resultPageInfoArray.append(sourceWebsite)
    return resultPageInfoArray