# Assumed module-level setup (the original script defines these elsewhere):
from bs4 import BeautifulSoup as s
import requests as r

session = r.Session()
hed = {'User-Agent': 'Mozilla/5.0'}  # assumption: any desktop User-Agent header
# `url` is the AccuWeather location-search URL, defined elsewhere in the script.

def main():
    # List the provinces found on the search-results page.
    b = session.get(url, headers=hed)
    data = []
    nomor = -1
    sop = s(b.content, 'html.parser')
    for x in sop("a", class_="search-result"):
        nomor += 1
        data.append([x.text, x['href']])
        print(str([nomor]), x.text)
    inp = int(input('[√] Choose province number : '))
    v = data[inp][1]
    z = r.get("https://www.accuweather.com/" + v, headers=hed)
    # List the regions within the chosen province.
    daa = []
    nomor = -1
    sp = s(z.content, 'html.parser')
    for xe in sp("a", class_="search-result"):
        nomor += 1
        daa.append([xe.text, xe['href']])
        print(str([nomor]), xe.text)
    inp = int(input('[√] Choose region number : '))
    m = daa[inp][1]
    h = r.get("https://www.accuweather.com/" + m, headers=hed)
    # Print each weather card for the chosen region.
    sg = s(h.text, 'html.parser')
    nomor = 0
    for xc in sg("div", class_="card weather-card content-module non-ad"):
        nomor += 1
        z = xc.text.strip()
        print(str(nomor), z)
    input("Press Enter to return to the menu...")
    main()  # loop back to the menu
def getProducts(url):
    # pUrl, pName, plist, pIURL, pPrice are module-level lists shared across calls.
    request = requests.get(url)
    soup = s(request.content, 'html.parser')
    li = soup.find_all('li')
    items = [i for i in li
             if 'class' in i.attrs and i['class'] == ['item', 'last']]
    for i in items:
        # Normalise the product name: drop any "(...)" group, then a trailing
        # "- Word" suffix, then surrounding whitespace.
        name = re.sub(r'- \w+', '', re.sub(r'\(.*\)', '', i.a['title']).strip()).strip()
        if name.lower() not in plist:  # add only unique product names
            pUrl.append(i.a['href'])
            pName.append(name)
            plist.append(name.lower())
            pIURL.append(i.img['src'])
            p = i.find_all('p')
            para = [j for j in p
                    if 'class' in j.attrs and j['class'] == ['special-price']]
            if len(para) == 0:
                # fall back to the regular price box when no special price exists
                p = i.find_all('div')
                para = [j for j in p
                        if 'class' in j.attrs and j['class'] == ['price-box']]
            price = [j for j in para[0].text.split('\n') if '$' in j]
            pPrice.append(price[0])
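# For reference, what the two-step title normalisation in getProducts does to
# a sample value (the input string here is invented for illustration):
#
#   >>> import re
#   >>> t = 'Gadget Pro (2-Pack) - Black'
#   >>> re.sub(r'- \w+', '', re.sub(r'\(.*\)', '', t).strip()).strip()
#   'Gadget Pro'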
def scrape(url):
    try:
        req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    except ValueError:
        return []
    webpage = urlopen(req).read()
    soup = s(webpage, "lxml")
    title = soup.find('title')
    body = soup.find_all('p')
    textTitle = title.get_text().replace(',', '')
    textBody = ''
    for text in body:
        textBody = textBody + text.get_text().replace(',', '')
    textBody = textBody.replace('\n', '')
    blog = [textTitle, textBody]
    return blog
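# A minimal usage sketch (the URL is a placeholder): scrape() returns
# [title, body] with commas stripped, so the pair can be written directly
# as one CSV row.
row = scrape('https://example.com/some-post')
if row:
    with open('blog.csv', 'a', encoding='utf-8') as f:
        f.write(','.join(row) + '\n')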
def getProducts(url):
    request = requests.get(url)
    soup = s(request.content, 'html.parser')
    li = soup.find_all('li')
    items = [i for i in li
             if 'class' in i.attrs and i['class'] == ['item', 'last']]
    for i in items:
        pUrl.append(i.a['href'])
        pName.append(i.a['title'])
        pIURL.append(i.img['src'])
        p = i.find_all('p')
        para = [j for j in p
                if 'class' in j.attrs and j['class'] == ['special-price']]
        if len(para) == 0:
            p = i.find_all('div')
            para = [j for j in p
                    if 'class' in j.attrs and j['class'] == ['price-box']]
        price = [j for j in para[0].text.split('\n') if '$' in j]
        pPrice.append(price[0])
def main():
    for i in range(1, 3):
        startPage = i  # start page, 50 entries per page
        numPosts = 50  # number of blog posts per page request, 50 max
        url = ('https://hotair.com/page/' + str(startPage)
               + '?ordinal=' + str(numPosts))
        req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        webpage = urlopen(req).read()
        soup = s(webpage, "html.parser")
        links = soup.find_all(class_='wp-card__img mt-2')
        linkList = []
        # collect the individual blog-post URLs
        for link in links:
            linkList.append('https://hotair.com' + link.find('a').get('href'))
        # scrape each individual blog post
        for link in linkList:
            scrape(link)
    return
def parser(url):
    soup = s(webPage(url), 'html.parser')
    h3 = soup.find_all('h3')
    if len(h3) == 0:
        return False
    h3 = h3[:20]  # each results page contains 20 items only
    for tag in h3:
        pName.append(str(tag.string))
        tags = getParentTags(tag)
        pSeller.append(tags[1].a.text)
        pSUrl.append("https://www.google.com" + tags[0].a["href"])
        pPrice.append(tags[1].span.span.text)
        if tags[2]['class'] == ['hBUZL', 'Rv2Cae']:
            # the item has reviews: pull total ratings, star rating, description
            for j in tags[2].find_all("span"):
                if j.text != "":
                    pTR.append(j.text.split()[0])
            pRating.append(tags[2].span.div["aria-label"])
            pDescription.append(tags[3].text)
            if len(tags) > 4 and tags[4]["class"] == ['hBUZL']:
                ptags.append(','.join(tags[4].text.split('·')))
            else:
                ptags.append('')
        else:
            # no review block: keep the result lists aligned with placeholders
            pTR.append('')
            pDescription.append(tags[2].text)
            ptags.append('')
            pRating.append('')
    return True
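# parser() depends on helpers defined elsewhere (webPage, getParentTags). A
# minimal sketch of the fetch helper, assuming it simply returns the raw page
# markup (this is an assumption, not the original implementation):
import requests

def webPage(url):
    # fetch the results page with a desktop User-Agent so Google serves full markup
    return requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}).content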
def scrape(url, status, root):
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    webpage = urlopen(req).read()
    soup = s(webpage, "lxml")
    title = soup.find_all('title')
    date = soup.find_all(class_="timestamp__date--published")
    author = soup.find_all(class_="author-card__link yr-author-name")
    body = soup.find_all('p')
    body = body[:-1]
    # save title, date, and author as text
    textTitle = title[0].get_text()[:-11].replace(',', '')
    textDate = date[0].get_text()[:-12]
    if len(author) == 0:
        textAuthor = 'Associated Press'
    else:
        textAuthor = author[0].get_text()
    # reformat the date as YYYY-MM-DD
    textDate = textDate[7:11] + "-" + textDate[1:3] + "-" + textDate[4:6]
    textBody = textTitle + ', ' + textAuthor + ", " + textDate + ", "
    # clean the text body
    for i in body:
        textBody = textBody + i.get_text().replace(',', '') + " "
    textBody = 'L, ' + textBody + '\n'
    blogTitle = 'L HuffPost ' + str(textAuthor) + " " + str(textTitle) + ".csv"
    invalid = '<>:\"\\|?*\'/'
    for char in invalid:
        blogTitle = blogTitle.replace(char, '')
    step = '[+] HP: ' + textTitle
    status['text'] = "{}".format(step)
    root.update()
    # write the file
    with open(sys.path[0] + "/Blogs/SavedBlogs/" + blogTitle, "w+",
              encoding='utf-8') as file:
        file.write(textBody)
    return
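# scrape() expects a Tkinter label (`status`) and window (`root`) so progress
# can be shown while pages download. A minimal sketch of a compatible caller;
# the window layout is an assumption, only the status/root contract matters:
import tkinter as tk

root = tk.Tk()
status = tk.Label(root, text='idle')
status.pack()
scrape('https://www.huffpost.com/entry/example', status, root)  # placeholder URL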
def scrape(url, status, root):
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    webpage = urlopen(req).read()
    soup = s(webpage, "lxml")
    title = soup.find('title')
    # pull the publication date out of the story URL
    date = re.findall(
        r'story\/[0-9][0-9][0-9][0-9]\/*[0-9]*[0-9]\/*[0-9]*[0-9]', url)
    author = soup.find(class_='author-name')
    body = soup.find_all('p')
    textTitle = title.get_text().replace(',', '')
    textDate = date[0][6:].replace('/', '-')
    # zero-pad single-digit months and days to get YYYY-MM-DD
    if textDate[6] == '-':
        textDate = textDate[:5] + '0' + textDate[5:]
    if len(textDate) == 9:
        textDate = textDate[:8] + '0' + textDate[8:]
    textAuthor = author.get_text().replace('\n', '')
    textBody = textTitle + ', ' + textAuthor + ", " + textDate + ", "
    for text in body:
        textBody = textBody + text.get_text().replace(',', '') + ' '
    textBody = textBody.replace('\n', '')
    textBody = 'L, ' + textBody + '\n'
    blogTitle = 'L DailyKos ' + textAuthor + ' ' + textTitle + '.csv'
    invalid = '<>:\"\\|?*\'/\n'
    for char in invalid:
        blogTitle = blogTitle.replace(char, '')
    step = "[+] DK: " + textTitle
    status['text'] = "{}".format(step)
    root.update()
    with open(sys.path[0] + "/Blogs/SavedBlogs/" + blogTitle, "w+",
              encoding='utf-8') as file:
        file.write(textBody)
    return "DailyKos: " + textTitle
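# The index-based zero-padding above can be expressed with datetime instead;
# a sketch assuming the URL always yields year/month/day after 'story/'
# (normalize_date is a hypothetical helper, not part of the original script):
from datetime import datetime

def normalize_date(raw):
    # 'story/2020/7/4' -> '2020-07-04'; strptime accepts unpadded month/day
    return datetime.strptime(raw[6:], '%Y/%m/%d').strftime('%Y-%m-%d')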
def scrape(url, status, root):
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    webpage = urlopen(req).read()
    soup = s(webpage, "lxml")
    # variable names
    title = soup.find('title')
    date = re.findall(r'[0-9][0-9][0-9][0-9]\/[0-9][0-9]\/[0-9][0-9]', url)
    author = soup.find('meta', attrs={'name': 'author'})
    body = soup.find_all('p')
    body = body[:-1]
    # clean the text
    textTitle = title.get_text().replace(',', '')[:-8]
    textDate = date[0].replace('/', '-')
    textAuthor = author['content']
    textBody = textTitle + ', ' + textAuthor + ", " + textDate + ", "
    # build the text body
    for text in body:
        textBody = textBody + text.get_text().replace(',', '') + ' '
    textBody = 'C, ' + textBody + '\n'
    # build the file name
    blogTitle = 'C HotAir ' + textAuthor + ' ' + textTitle + '.csv'
    invalid = '<>:\"\\|?*\'/\n'
    for char in invalid:
        blogTitle = blogTitle.replace(char, '')
    # update the GUI status line
    step = '[+] HA: ' + textTitle
    status['text'] = "{}".format(step)
    root.update()
    # write the file
    with open(sys.path[0] + "/Blogs/SavedBlogs/" + blogTitle, "w+",
              encoding='utf-8') as file:
        file.write(textBody)
    return 'HotAir: ' + textTitle
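# Each scrape() scrubs the same invalid filename characters one at a time;
# the loop is equivalent to a single str.translate pass (sanitize is a
# hypothetical helper, not part of the original scripts):
def sanitize(name, invalid='<>:"\\|?*\'/\n'):
    # delete every character listed in `invalid`
    return name.translate({ord(c): None for c in invalid})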
def scrape(url, status, root):
    # scrape an individual post page
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    webpage = urlopen(req).read()
    soup = s(webpage, "lxml")
    title = soup.find('title')
    date = soup.find('meta', property='article:published_time')
    author = soup.find(class_='url fn n')
    body = soup.find_all('p')
    # clean the variables
    textTitle = title.get_text()[:-13].replace(',', '')
    textDate = re.findall(r'[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]',
                          str(date))[0]
    textAuthor = author.get_text()
    textBody = textTitle + ', ' + textAuthor + ", " + textDate + ", "
    # clean the text body
    for text in body:
        textBody = textBody + text.get_text().replace(',', '') + ' '
    textBody = 'C, ' + textBody + '\n'
    # file name
    blogTitle = 'C PowerLine ' + textAuthor + ' ' + textTitle + '.csv'
    invalid = '<>:\"\\|?*\'/'
    for char in invalid:
        blogTitle = blogTitle.replace(char, '')
    step = "[+] PL : " + textTitle
    status['text'] = "{}".format(step)
    root.update()
    # write the file
    with open(sys.path[0] + "/Blogs/SavedBlogs/" + blogTitle, "w+",
              encoding='utf-8') as file:
        file.write(textBody)
    return
def main():
    # the range controls how many listing pages are scraped; adjust as needed
    for x in range(0, 3):
        url = 'https://crooksandliars.com/politics?page=' + str(x)
        req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        webpage = urlopen(req).read()
        soup = s(webpage, "html.parser")
        links = soup.find_all('h2')
        links = links[1:]  # drop the first h2, which is not a post link
        linkList = []
        for link in links:
            linkList.append(link.find('a').get('href'))
        for link in linkList:
            scrape('https://crooksandliars.com/' + link)
    return
def scrape(url, status, root):
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    webpage = urlopen(req).read()
    soup = s(webpage, "lxml")
    title = soup.find('title')
    date = soup.find('time')
    author = soup.find(class_='nh-footer')
    body = soup.find_all('p')
    # clean text for csv
    textTitle = title.get_text()[:-19].replace(',', '')
    textDate = date['datetime'].split()[0]
    textAuthor = author.find('a').get_text()
    textBody = textTitle + ', ' + textAuthor + ", " + textDate + ", "
    for text in body:
        textBody = textBody + text.get_text().replace(',', '') + ' '
    textBody = textBody.replace('\n', '')
    textBody = 'L, ' + textBody + '\n'
    # blog title name
    blogTitle = 'L CrooksAndLiars ' + textAuthor + ' ' + textTitle + '.csv'
    invalid = '<>:\"\\|?*\'/\n'
    for char in invalid:
        blogTitle = blogTitle.replace(char, '')
    step = "[+] C&L: " + textTitle
    status['text'] = "{}".format(step)
    root.update()
    # write file
    with open(sys.path[0] + "/Blogs/SavedBlogs/" + blogTitle, "w+",
              encoding='utf-8') as file:
        file.write(textBody)
    return 'CrooksAndLiars: ' + textTitle
def main():
    # the range controls how many listing pages are scraped
    for x in range(0, 5):
        url = 'https://www.powerlineblog.com/page/' + str(x)
        req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        webpage = urlopen(req).read()
        soup = s(webpage, "html.parser")
        links = soup.find_all(class_='entry-title')
        linkList = []
        for link in links:
            linkList.append(link.find('a').get('href'))
        for link in linkList:
            scrape(link)
    return
def scrape(url):
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    webpage = urlopen(req).read()
    soup = s(webpage, "lxml")
    # variable names
    title = soup.find('title')
    date = re.findall(r'[0-9][0-9][0-9][0-9]\/[0-9][0-9]\/[0-9][0-9]', url)
    author = soup.find('meta', attrs={'name': 'author'})
    body = soup.find_all('p')
    body = body[:-1]
    textTitle = title.get_text().replace(',', '')[:-11]
    textDate = date[0].replace('/', '-')
    textAuthor = author['content']
    textBody = textTitle + ', ' + textAuthor + ", " + textDate + ", "
    # clean the text body
    for text in body:
        textBody = textBody + text.get_text().replace(',', '') + ' '
    textBody = 'C, ' + textBody + '\n'
    # create the file name
    blogTitle = 'C RedState ' + textAuthor + ' ' + textTitle + '.csv'
    invalid = '<>:\"\\|?*\'/'
    for char in invalid:
        blogTitle = blogTitle.replace(char, '')
    print("RedState: " + textTitle)
    # write the file
    with open(sys.path[0] + "/SavedBlogs/" + blogTitle, "w+",
              encoding='utf-8') as file:
        file.write(textBody)
    return
def main(status, root):
    # blog scraper
    for x in range(1, 3):
        url = 'https://www.redstate.com/diaries-list/page/' + str(x) + '/'
        req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        webpage = urlopen(req).read()
        soup = s(webpage, "html.parser")
        links = soup.find_all(class_='wp-card__img')
        linkList = []
        for link in links:
            linkList.append('https://redstate.com' + link.find('a').get('href'))
        for link in linkList:
            scrape(link, status, root)
    return
def main(status, root):
    # the range controls how many listing pages (50 stories each) are scraped
    for x in range(1, 2):
        url = ('https://www.dailykos.com/part/story/table/by_current?page='
               + str(x))
        req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        webpage = urlopen(req).read()
        soup = s(webpage, "html.parser")
        links = soup.find(class_='styled storiesAsGrid').find_all(class_='title')
        linkList = []
        for link in links:
            linkList.append(link.get('href'))
        for link in linkList:
            scrape('https://www.dailykos.com' + link, status, root)
            time.sleep(.5)  # brief pause between requests
    return
def web_parser(websource):
    exam_fee_s = ''
    exam_fee_c = ''
    amount_st = [0, 0, 0, 0]
    amount_sc = [0, 0, 0, 0]
    amount_ct = [0, 0, 0, 0]
    amount_cc = [0, 0, 0, 0]
    soup = s(websource, 'html.parser')
    contentTitle = soup.find('h1', id="hdrTitle").text.strip()
    # Flatten the overview block into one whitespace-normalised string, then
    # slice the fields out by their labels.
    table = soup.find('div', id="overview")
    t = table.text.replace('\n', '').split(' ')
    t1 = ''
    for i in range(len(t)):
        if t[i].strip() != '':
            t1 = t1 + " " + t[i].strip()
    reference_no = t1[(t1.find("Reference No") + len("Reference No")):t1.find("Part of")].strip()
    partOf = t1[(t1.find("Part of") + len("Part of")):t1.find("Duration")].strip()
    duration = t1[(t1.find("Duration") + len("Duration")):t1.find("Course Time")].strip()
    # str.find() returns an int, so compare against -1, not the string '-1'
    if t1.find("pm") != -1:
        CourseTime = t1[(t1.find("Course Time") + len("Course Time")):(t1.find("pm") + len("pm"))].strip()
    else:
        CourseTime = t1[(t1.find("Course Time") + len("Course Time")):t1.find("PDU")].strip()
    courseIntro = t1[(t1.find("details.") + len("details.")):].strip()
    classes = soup.find_all('span', {'class': 'accordion_header-class'})
    class_dates = soup.find_all('span', {'class': 'accordion_header-date'})
    for i in range(len(classes)):
        classes[i] = classes[i].text
    for i in range(len(class_dates)):
        class_dates[i] = class_dates[i].text.strip().replace('\n', ' ')
    tag = soup.find('div', {'class': 'main-content-entry', 'id': 'tab1'})
    keyTakeAways = tag.getText().replace('\n', '').strip().replace(tag.find('h2').getText(), '')
    tag = soup.find('div', {'class': 'main-content-entry', 'id': 'tab2'})
    whoShouldAttend = tag.getText().replace('\n', '').strip().replace(tag.find('h2').getText(), '')
    tag = soup.find('div', {'class': 'main-content-entry', 'id': 'tab3'})
    if tag.getText().find("Fees & Funding") != -1 and len(tag.find_all('table')) != 0:
        topicsCovered = ''
        table = tag.find_all('table')
    else:
        topicsCovered = tag.getText().replace('\n', '').strip().replace(tag.find('h2').getText(), '')
    tag = soup.find('div', {'class': 'main-content-entry', 'id': 'tab4'})
    table = tag.find_all('table')
    # The fees section contains 1, 2, or 4 tables depending on the course.
    l = len(table)
    print(l)
    dimc = 0
    if l == 2:
        w = None
        rows = table[0].find_all('tr')
        data = []
        for i in rows:
            cols = i.find_all('td')
            data.append(cols)
        dimc = len(data[0])
        if dimc == 3:
            # simple fee table: self- and company-sponsored fees are the same
            amount_st = get_simple_cfee(table[0])
            amount_sc = amount_st
            exam_fee_s = get_exam_fee(table[1])
            exam_fee_c = exam_fee_s
            amount_ct = ['N/A', 'N/A', 'N/A', 'N/A']
            amount_cc = amount_ct
        else:
            # self-sponsored total (first matching row)
            for i in table[0].descendants:
                r = re.search("Total nett course fee payable,", str(i.string))
                if r is not None:
                    w = i
                    break
            if w is not None:
                w = get_parent_td(w)
                amount_st = get_amount_course(w)
            else:
                amount_st = [0, 0, 0, 0]
            # self-sponsored claimable (last matching row)
            w = None
            for i in table[0].descendants:
                r = re.search("Total nett course fee payable,", str(i.string))
                if r is not None:
                    w = i
            if w is not None:
                w = get_parent_td(w)
                amount_sc = get_amount_course(w)
            else:
                amount_sc = amount_st
            # company-sponsored total
            w = None
            for i in table[1].descendants:
                r = re.search("Total nett course fee payable,", str(i.string))
                if r is not None:
                    w = i
                    break
            if w is not None:
                w = get_parent_td(w)
                amount_ct = get_amount_course(w)
            else:
                amount_ct = [0, 0, 0, 0]
            # company-sponsored claimable; '-' cells fall back to the totals
            w = None
            for i in table[1].descendants:
                r = re.search('from the various funding schemes', str(i.string))
                if r is not None:
                    w = i
                    break
            if w is not None:
                w = get_parent_td(w)
                amount_cc = get_amount_course(w)
                for i in range(4):
                    if amount_cc[i] == '-':
                        amount_cc[i] = amount_ct[i]
            else:
                amount_cc = amount_ct
    elif l == 4:
        # separate course-fee and exam-fee tables for both sponsorship types
        w = None
        for i in table[0].descendants:
            r = re.search("Total nett course fee payable,", str(i.string))
            if r is not None:
                w = i
                break
        if w is not None:
            w = get_parent_td(w)
            amount_st = get_amount_course(w)
        else:
            amount_st = [0, 0, 0, 0]
        w = None
        for i in table[0].descendants:
            r = re.search("Total nett course fee payable,", str(i.string))
            if r is not None:
                w = i
        if w is not None:
            w = get_parent_td(w)
            amount_sc = get_amount_course(w)
        else:
            amount_sc = amount_st
        w = None
        for i in table[2].descendants:
            r = re.search("Total nett course fee payable,", str(i.string))
            if r is not None:
                w = i
                break
        if w is not None:
            w = get_parent_td(w)
            amount_ct = get_amount_course(w)
        else:
            amount_ct = [0, 0, 0, 0]
        w = None
        for i in table[2].descendants:
            r = re.search('from the various funding schemes', str(i.string))
            if r is not None:
                w = i
                break
        if w is not None:
            w = get_parent_td(w)
            amount_cc = get_amount_course(w)
            for i in range(4):
                if amount_cc[i] == '-':
                    amount_cc[i] = amount_ct[i]
        else:
            amount_cc = amount_ct
        exam_fee_s = get_exam_fee(table[1])
        exam_fee_c = get_exam_fee(table[3])
    elif l == 1:
        exam_fee_s = get_Aop_fee(table[0])
    print(amount_st)
    # join all course-header strings into a comma-separated parent field
    parent = ''
    tag = soup.find_all('div', {'class': 'block-accordion-courses--header'})
    for i in tag:
        if parent == '':
            parent = i.string
        else:
            parent = parent + ", " + i.string
    # append one row per scheduled class
    for i in range(len(classes)):
        courseName.append(contentTitle)
        referenceNo.append(reference_no)
        coursePartOf.append(partOf)
        courseDuration.append(duration)
        courseTime.append(CourseTime)
        courseIntroduction.append(courseIntro)
        courseClasses.append(classes[i])
        courseDates.append(class_dates[i])
        courseTakeAways.append(keyTakeAways)
        courseAttendes.append(whoShouldAttend)
        courseTopicsCovered.append(topicsCovered)
        courseSsip_t.append(amount_st[0])
        courseSspr_t.append(amount_st[1])
        courseSssfmces_t.append(amount_st[2])
        courseSswfts_t.append(amount_st[3])
        courseSsip_c.append(amount_sc[0])
        courseSspr_c.append(amount_sc[1])
        courseSssfmces_c.append(amount_sc[2])
        courseSswfts_c.append(amount_sc[3])
        courseCsip_t.append(amount_ct[0])
        courseCspr_t.append(amount_ct[1])
        courseCssfmces_t.append(amount_ct[2])
        courseCswfts_t.append(amount_ct[3])
        courseCsip_c.append(amount_cc[0])
        courseCspr_c.append(amount_cc[1])
        courseCssfmces_c.append(amount_cc[2])
        courseCswfts_c.append(amount_cc[3])
        courseExamS.append(exam_fee_s)
        courseExamC.append(exam_fee_c)
        courseParent.append(parent)