Example #1
def main():
	b = session.get(url,headers=hed)
	data = []
	nomor = -1
	sop = s(b.content, 'html.parser')
	for x in sop("a", class_="search-result"):
		nomor += 1
		data.append([x.text, x['href']])
		print(str([nomor]),x.text)
	inp = int(input('[√] Select province number : '))
	v = data[inp][1]
	z = r.get("https://www.accuweather.com/"+v,headers=hed)
	daa = []
	nomor = -1
	sp = s(z.content, 'html.parser')
	for xe in sp("a", class_="search-result"):
		nomor += 1
		daa.append([xe.text, xe['href']])
		print(str([nomor]),xe.text)
	inp = int(input('[√] Select region number : '))
	m = daa[inp][1]
	h = r.get("https://www.accuweather.com/"+m, headers=hed)
	sg = s(h.text, 'html.parser')
	nomor = 0
	for xc in sg("div",class_="card weather-card content-module non-ad"):
		nomor = 1
		x = xc.text
		z = x.strip()
		print(str(nomor),z)
	it = input("Press Enter to return to the menu...")
	main()
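The snippet above assumes setup that is not shown: s is BeautifulSoup, session and r are requests objects, hed is a headers dict and url points at an AccuWeather location-search page. A minimal sketch of that assumed setup (all names here are guesses, not the original author's module header):

# Minimal setup assumed by main() above; a sketch only.
import requests
from bs4 import BeautifulSoup as s

r = requests                      # the example calls both session.get and r.get
session = requests.Session()
hed = {'User-Agent': 'Mozilla/5.0'}
url = 'https://www.accuweather.com/en/search-locations'  # placeholder search URL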
Example #2
def getProducts(url):
    #print(url)
    request = requests.get(url)
    soup = s(request.content,'html.parser')
    li = soup.find_all('li')
    items = [i for i in li if 'class' in i.attrs.keys() and i['class']==['item', 'last']]
    for i in items:
        #print(i)
        # keep only products whose normalised name has not been seen yet:
        # drop any "(...)" part and a trailing "- Variant" suffix from the title
        name = re.sub(r'- \w+', "", re.sub(r"\(.*\)", "", i.a['title']).strip()).strip()
        if name.lower() not in plist:
            pUrl.append(i.a['href'])
            pName.append(name)
            plist.append(name.lower())
            #print(i.a['title'])
            pIURL.append(i.img['src'])
            p = i.find_all('p')
            #print(p)
            para = [j for j in p if 'class' in j.attrs.keys() and j['class']==['special-price']]
            if len(para)==0:
                p = i.find_all('div')
                para = [j for j in p if 'class' in j.attrs.keys() and j['class']==['price-box']]
                #print(para)
            price = [j  for j in para[0].text.split('\n') if '$' in j]
            pPrice.append(price[0])
def scrape(url):
	
	try:
		req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
	except ValueError:
		return([])

	webpage = urlopen(req).read()

	soup = s(webpage, "lxml")

	title = soup.find('title')
	body = soup.find_all('p')
	
	textTitle = title.get_text().replace(',','')
	textBody = '' 
    
	for text in body:
		textBody = textBody + text.get_text().replace(',','')
		textBody = textBody.replace('\n', '')

	blog = [textTitle,textBody]
	
	
	return(blog)
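Both functions above depend on module-level lists and imports that are not shown. A sketch of the assumed setup (only the list names come from the code; everything else is a guess), with the title normalisation written as a standalone helper so it can be checked in isolation:

# Assumed module-level setup for getProducts()/scrape() above (a sketch).
import re
import requests
from bs4 import BeautifulSoup as s
from urllib.request import Request, urlopen

pUrl, pName, plist, pIURL, pPrice = [], [], [], [], []

def normalise_title(title):
    # same transformation the loop applies inline:
    # "Widget (Blue) - Large" -> "Widget"
    return re.sub(r'- \w+', "", re.sub(r"\(.*\)", "", title).strip()).strip()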
Example #4
def getProducts(url):
    #print(url)
    request = requests.get(url)
    soup = s(request.content, 'html.parser')
    li = soup.find_all('li')
    items = [
        i for i in li
        if 'class' in i.attrs.keys() and i['class'] == ['item', 'last']
    ]
    for i in items:
        #print(i)
        pUrl.append(i.a['href'])
        pName.append(i.a['title'])
        #print(i.a['title'])
        pIURL.append(i.img['src'])
        p = i.find_all('p')
        #print(p)
        para = [
            j for j in p
            if 'class' in j.attrs.keys() and j['class'] == ['special-price']
        ]
        if len(para) == 0:
            p = i.find_all('div')
            para = [
                j for j in p
                if 'class' in j.attrs.keys() and j['class'] == ['price-box']
            ]
        #print(para)
        price = [j for j in para[0].text.split('\n') if '$' in j]
        pPrice.append(price[0])
def main():

    for i in range(1, 3):
        #start page, 50 entries per page
        startPage = i
        #number of blog posts on page request, 50 max
        numPosts = 50

        #variable names
        url = 'https://hotair.com/page/' + str(startPage) + '?ordinal=' + str(
            numPosts)
        req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        webpage = urlopen(req).read()
        soup = s(webpage, "html.parser")
        links = soup.find_all(class_='wp-card__img mt-2')
        linkList = []

        #loop that calls individual blog post urls
        for link in links:
            linkList.append(
                str('https://hotair.com') + link.find('a').get('href'))

        #loop that scrapes individual blog posts
        for link in linkList:
            scrape(link)

    return ()
Example #6
def parser(url):
    soup = s(webPage(url), 'html.parser')
    h3 = soup.find_all('h3')
    l = len(h3)
    if l != 0:
        h3 = h3[:20]  # Each page contains only 20 items.
        for tag in h3:
            pName.append(str(tag.string))
            tags = getParentTags(tag)
            pSeller.append(tags[1].a.text)
            pSUrl.append("https://www.google.com" + tags[0].a["href"])
            pPrice.append(tags[1].span.span.text)
            if tags[2]['class'] == ['hBUZL', 'Rv2Cae']:
                for j in tags[2].find_all("span"):
                    if j.text != "":
                        pTR.append(j.text.split()[0])
                pRating.append(tags[2].span.div["aria-label"])
                pDescription.append(tags[3].text)
                if len(tags) > 4 and tags[4]["class"] == ['hBUZL']:
                    ptags.append(','.join(tags[4].text.split('·')))
                else:
                    ptags.append('')

            else:
                pTR.append('')
                pDescription.append(tags[2].text)
                ptags.append('')
                pRating.append('')
        return True
    else:
        return False
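parser() relies on two helpers that are not shown, webPage(url) and getParentTags(tag), and appends to module-level lists (pName, pSeller, pSUrl, pPrice, pTR, pRating, pDescription, ptags). webPage() presumably just fetches the raw HTML; a minimal sketch of that assumption (getParentTags is not reconstructed here):

# Assumed fetch helper and globals for parser() above; a sketch, not the original code.
from urllib.request import Request, urlopen

def webPage(url):
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    return urlopen(req).read()

pName, pSeller, pSUrl, pPrice = [], [], [], []
pTR, pRating, pDescription, ptags = [], [], [], []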
def scrape(url,status,root):

	req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})

	webpage = urlopen(req).read()

	soup = s(webpage, "lxml")

	title = soup.find_all('title')
	date = soup.find_all(class_="timestamp__date--published")
	author = soup.find_all(class_="author-card__link yr-author-name")
	body = soup.find_all('p')
	body = body[:-1]
	
	
	textAuthor = ''
	# saves title, date, author to text
	textTitle = title[0].get_text()[:-11].replace(',','')
	
	
	textDate = date[0].get_text()[:-12]
		
	if len(author) ==0:
		textAuthor = 'Associated Press'			
	else:	
		textAuthor = author[0].get_text()

	# formats date as YYYY-MM-DD
	textDate = textDate[7:11]+"-"+textDate[1:3]+"-"+textDate[4:6]

	textBody = textTitle + ', '+ textAuthor + ", " + textDate + ", "

	# cleans text body
	for i in body:
		textBody = textBody + i.get_text().replace(',','') + " "

	textBody = 'L, ' +  textBody + '\n'
	
	blogTitle = 'L HuffPost '+str(textAuthor)+ " " + str(textTitle) + ".csv"
	invalid = '<>:\"\\|?*\'/'
	for char in invalid:
		blogTitle = blogTitle.replace(char,'')

	step = '[+] HP: '+textTitle
	status['text'] = "{}".format(step)
	root.update()

	# writes file
	file = open(sys.path[0]+"/Blogs/SavedBlogs/"+blogTitle , "w+", encoding = 'utf-8')
	file.write(textBody)
	file.close()

	return()
def scrape(url, status, root):

    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})

    webpage = urlopen(req).read()

    soup = s(webpage, "lxml")

    title = soup.find('title')
    date = re.findall(
        r'story\/[0-9][0-9][0-9][0-9]\/*[0-9]*[0-9]\/*[0-9]*[0-9]', url)
    author = soup.find(class_='author-name')
    body = soup.find_all('p')

    textTitle = title.get_text().replace(',', '')
    textDate = date[0][6:].replace('/', '-')

    if (textDate[6] == '-'):
        textDate = textDate[:5] + '0' + textDate[5:]

    if (len(textDate) == 9):
        textDate = textDate[:8] + '0' + textDate[8:]

    textAuthor = author.get_text().replace('\n', '')

    textBody = textTitle + ', ' + textAuthor + ", " + textDate + ", "

    for text in body:
        textBody = textBody + text.get_text().replace(',', '') + ' '
        textBody = textBody.replace('\n', '')

    textBody = 'L, ' + textBody + '\n'

    blogTitle = 'L DailyKos ' + textAuthor + ' ' + textTitle + '.csv'
    invalid = '<>:\"\\|?*\'/\n'
    for char in invalid:
        blogTitle = blogTitle.replace(char, '')

    step = "[+] DK: " + textTitle
    status['text'] = "{}".format(step)
    root.update()

    file = open(sys.path[0] + "/Blogs/SavedBlogs/" + blogTitle,
                "w+",
                encoding='utf-8')
    file.write(textBody)
    file.close()

    return ("DailyKos: " + textTitle)
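The scrape(url, status, root) functions in this and the following examples share scaffolding that is not shown: BeautifulSoup imported as s, urllib's Request/urlopen, re and sys, and a Tkinter label (status) and window (root) used for progress feedback. A hedged sketch of that shared setup:

# Shared setup assumed by the scrape(url, status, root) functions (a sketch;
# only the status/root usage is taken from the call sites above).
import re
import sys
import tkinter as tk
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup as s

root = tk.Tk()
status = tk.Label(root, text='idle')   # scrape() updates status['text'] per post
status.pack()
# Note: the output directory sys.path[0] + "/Blogs/SavedBlogs/" must already exist.
# scrape(some_post_url, status, root)  # some_post_url is a placeholder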
Example #9
def scrape(url,status,root):

	req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})

	webpage = urlopen(req).read()

	soup = s(webpage, "lxml")	

	#variable names
	title = soup.find('title')
	date = re.findall(r'[0-9][0-9][0-9][0-9]\/[0-9][0-9]\/[0-9][0-9]',url)
	author = soup.find('meta', attrs={'name':'author'})
	body = soup.find_all('p')
	body = body[:-1]

	#cleaning of text
	textTitle = title.get_text().replace(',','')[:-8]
	textDate = date[0].replace('/','-')
	textAuthor = author['content']

	textBody = textTitle + ', '+ textAuthor + ", " + textDate + ", "

	#creation of text body
	for text in body:
		textBody = textBody + text.get_text().replace(',','')+' '

	textBody = 'C, ' + textBody + '\n'
	
	#creation of file and file name
	blogTitle = 'C HotAir '+textAuthor+' '+textTitle+'.csv'
	invalid = '<>:\"\\|?*\'/\n'
	for char in invalid:
		blogTitle = blogTitle.replace(char,'')
	
	#terminal output
	step = '[+] HA: '+textTitle
	status['text'] = "{}".format(step)
	root.update()

	#writing of file
	file = open(sys.path[0]+"/Blogs/SavedBlogs/"+blogTitle , "w+", encoding = 'utf-8')
	file.write(textBody)
	file.close()
	

	return('HotAir: '+textTitle)
Example #10
def scrape(url, status, root):

    # scrape an individual post page
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})

    webpage = urlopen(req).read()

    soup = s(webpage, "lxml")

    title = soup.find('title')
    date = soup.find('meta', property='article:published_time')
    author = soup.find(class_='url fn n')
    body = soup.find_all('p')

    # variable cleaner
    textTitle = title.get_text()[:-13].replace(',', '')
    textDate = re.findall(r'[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]',
                          (str(date)))[0]
    textAuthor = author.get_text()

    textBody = textTitle + ', ' + textAuthor + ", " + textDate + ", "

    # text body cleaner
    for text in body:
        textBody = textBody + text.get_text().replace(',', '') + ' '

    textBody = 'C, ' + textBody + '\n'

    # file name
    blogTitle = 'C PowerLine ' + textAuthor + ' ' + textTitle + '.csv'
    invalid = '<>:\"\\|?*\'/'
    for char in invalid:
        blogTitle = blogTitle.replace(char, '')

    step = "[+] PL : " + textTitle
    status['text'] = "{}".format(step)
    root.update()

    # write operation
    file = open(sys.path[0] + "/Blogs/SavedBlogs/" + blogTitle,
                "w+",
                encoding='utf-8')
    file.write(textBody)
    file.close()

    return ()
Example #11
def main():

    # range controls how many listing pages are scraped; adjust as needed
    for x in range(0, 3):
        url = 'https://crooksandliars.com/politics?page=' + str(x)
        req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        webpage = urlopen(req).read()
        soup = s(webpage, "html.parser")
        links = soup.find_all('h2')
        links = links[1:]
        linkList = []

        for link in links:
            linkList.append(link.find('a').get('href'))

        for link in linkList:
            scrape('https://crooksandliars.com/' + link)

    return ()
Example #12
def scrape(url, status, root):

    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})

    webpage = urlopen(req).read()

    soup = s(webpage, "lxml")

    title = soup.find('title')
    date = soup.find('time')
    author = soup.find(class_='nh-footer')
    body = soup.find_all('p')

    # clean text for csv
    textTitle = title.get_text()[:-19].replace(',', '')
    textDate = (date['datetime'].split()[0])
    textAuthor = author.find('a').get_text()
    textBody = textTitle + ', ' + textAuthor + ", " + textDate + ", "

    for text in body:
        textBody = textBody + text.get_text().replace(',', '') + ' '
        textBody = textBody.replace('\n', '')

    textBody = 'L, ' + textBody + '\n'

    # blog title name
    blogTitle = 'L CrooksAndLiars ' + textAuthor + ' ' + textTitle + '.csv'
    invalid = '<>:\"\\|?*\'/\n'
    for char in invalid:
        blogTitle = blogTitle.replace(char, '')

    step = "[+] C&L: " + textTitle
    status['text'] = "{}".format(step)
    root.update()

    # write file
    file = open(sys.path[0] + "/Blogs/SavedBlogs/" + blogTitle,
                "w+",
                encoding='utf-8')
    file.write(textBody)
    file.close()

    return ('CrooksAndLiars: ' + textTitle)
Example #13
def main():

    # scrape the first 5 index pages
    for x in range(0, 5):
        url = 'https://www.powerlineblog.com/page/' + str(x)
        req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        webpage = urlopen(req).read()
        soup = s(webpage, "html.parser")
        links = soup.find_all(class_='entry-title')

        linkList = []

        for link in links:
            linkList.append(link.find('a').get('href'))

        for link in linkList:
            scrape(link)

    return ()
Example #14
def scrape(url):
	
	req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})

	webpage = urlopen(req).read()

	soup = s(webpage, "lxml")	

	# var names
	title = soup.find('title')
	date = re.findall(r'[0-9][0-9][0-9][0-9]\/[0-9][0-9]\/[0-9][0-9]',url)
	author = soup.find('meta', attrs={'name':'author'})
	body = soup.find_all('p')
	body = body[:-1]
	
	textTitle = title.get_text().replace(',','')[:-11]
	textDate = date[0].replace('/','-')
	textAuthor = author['content']
	
	textBody =textTitle + ', '+ textAuthor + ", " + textDate + ", "

	# text body cleaner
	for text in body:
		textBody = textBody + text.get_text().replace(',','')+' '

	textBody = 'C, ' + textBody + '\n'

	# file name creation
	blogTitle = 'C RedState '+textAuthor+' '+textTitle+'.csv'
	invalid = '<>:\"\\|?*\'/'
	for char in invalid:
		blogTitle = blogTitle.replace(char,'')
	
	print("RedState: "+textTitle)

	# file write
	file = open(sys.path[0]+"/SavedBlogs/"+blogTitle , "w+", encoding = 'utf-8')
	file.write(textBody)
	file.close()
	

	return()
Example #15
def main(status, root):

    # blog scraper
    for x in range(1, 3):
        url = 'https://www.redstate.com/diaries-list/page/' + str(x) + '/'
        req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        webpage = urlopen(req).read()
        soup = s(webpage, "html.parser")
        links = soup.find_all(class_='wp-card__img')

        linkList = []

        for link in links:
            linkList.append(
                str('https://redstate.com') + link.find('a').get('href'))

        for link in linkList:
            scrape(link, status, root)

    return ()
def main(status, root):

    # range controls how many index pages get scraped; range(1, 2) fetches a single page
    for x in range(1, 2):
        url = 'https://www.dailykos.com/part/story/table/by_current?page=' + str(
            x)
        req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        webpage = urlopen(req).read()
        soup = s(webpage, "html.parser")
        links = soup.find(class_='styled storiesAsGrid').find_all(
            class_='title')
        linkList = []
        #print(links)

        for link in links:
            linkList.append(link.get('href'))

        for link in linkList:
            scrape('https://www.dailykos.com' + link, status, root)
            time.sleep(.5)

    return ()
Example #17
def web_parser(websource):
    exam_fee_s = ''
    exam_fee_c = ''
    amount_st = [0, 0, 0, 0]
    amount_sc = [0, 0, 0, 0]
    amount_ct = [0, 0, 0, 0]
    amount_cc = [0, 0, 0, 0]
    soup = s(websource, 'html.parser')
    contentTitle = soup.find('h1', id="hdrTitle").text.strip()
    table = soup.find('div', id="overview")
    t = table.text.replace('\n', '').split('      ')
    t1 = ''
    for i in range(len(t)):
        if t[i].strip() != '':
            t1 = t1 + " " + t[i].strip()
    reference_no = t1[(t1.find("Reference No") +
                       len("Reference No")):(t1.find("Part of"))].strip()
    partOf = t1[(t1.find("Part of") +
                 len("Part of")):(t1.find("Duration"))].strip()
    duration = t1[(t1.find("Duration") +
                   len("Duration")):(t1.find("Course Time"))].strip()
    if t1.find("pm") != -1:
        CourseTime = t1[(t1.find("Course Time") +
                         len("Course Time")):(t1.find("pm") +
                                              len("pm"))].strip()
    else:
        CourseTime = t1[(t1.find("Course Time") +
                         len("Course Time")):(t1.find("PDU"))].strip()
    courseIntro = t1[(t1.find("details.") + len("details.")):].strip()
    classes = soup.find_all('span', {'class': 'accordion_header-class'})
    class_dates = soup.find_all('span', {'class': 'accordion_header-date'})
    for i in range(len(classes)):
        classes[i] = classes[i].text
    for i in range(len(class_dates)):
        class_dates[i] = class_dates[i].text.strip().replace('\n', ' ')
    tag = soup.find('div', {'class': 'main-content-entry', 'id': 'tab1'})
    keyTakeAways = tag.getText().replace('\n', '').strip().replace(
        tag.find('h2').getText(), '')
    tag = soup.find('div', {'class': 'main-content-entry', 'id': 'tab2'})
    whoShouldAttend = tag.getText().replace('\n', '').strip().replace(
        tag.find('h2').getText(), '')
    tag = soup.find('div', {'class': 'main-content-entry', 'id': 'tab3'})
    if tag.getText().find("Fees & Funding") != -1 and len(
            tag.find_all('table')) != 0:
        topicsCovered = ''
        table = tag.find_all('table')
    else:
        topicsCovered = tag.getText().replace('\n', '').strip().replace(
            tag.find('h2').getText(), '')
        tag = soup.find('div', {'class': 'main-content-entry', 'id': 'tab4'})
        table = tag.find_all('table')

    # The fee section may contain 1, 2, or 4 tables; each case is handled below.

    l = len(table)
    print(l)
    dimc = 0
    if l == 2:

        w = None
        rows = table[0].find_all('tr')
        data = []
        dimc = len(rows)
        for i in rows:
            cols = i.find_all('td')
            data.append(cols)
        dimc = len(data[0])

        if dimc == 3:
            amount_st = get_simple_cfee(table[0])
            amount_sc = amount_st
            exam_fee_s = get_exam_fee(table[1])
            exam_fee_c = exam_fee_s
            amount_ct = ['N/A', 'N/A', 'N/A', 'N/A']
            amount_cc = amount_ct

        else:
            for i in table[0].descendants:
                r = re.search("Total nett course fee payable,", str(i.string))
                if r is not None:
                    w = i
                    break
            if w is not None:
                w = get_parent_td(w)
                amount_st = get_amount_course(w)
            else:
                amount_st = [0, 0, 0, 0]

            w = None
            for i in table[0].descendants:
                r = re.search("Total nett course fee payable,", str(i.string))
                if r is not None:
                    w = i
            if w is not None:
                w = get_parent_td(w)
                amount_sc = get_amount_course(w)
            else:
                amount_sc = amount_st

            w = None
            for i in table[1].descendants:
                r = re.search("Total nett course fee payable,", str(i.string))
                if r is not None:
                    w = i
                    break
            if w is not None:
                w = get_parent_td(w)
                amount_ct = get_amount_course(w)
            else:
                amount_ct = [0, 0, 0, 0]

            w = None
            for i in table[1].descendants:
                r = re.search('from the various funding schemes',
                              str(i.string))
                if r is not None:
                    w = i
                    break
            if w is not None:
                w = get_parent_td(w)
                amount_cc = get_amount_course(w)
                for i in range(4):
                    if amount_cc[i] == '-':
                        amount_cc[i] = amount_ct[i]

            else:
                amount_cc = amount_ct

    elif l == 4:
        print("I am here")
        w = None
        for i in table[0].descendants:
            r = re.search("Total nett course fee payable,", str(i.string))
            if r is not None:
                w = i
                break
        if w is not None:
            w = get_parent_td(w)
            amount_st = get_amount_course(w)
        else:
            amount_st = [0, 0, 0, 0]

        w = None
        for i in table[0].descendants:
            r = re.search("Total nett course fee payable,", str(i.string))
            if r is not None:
                w = i

        if w is not None:
            w = get_parent_td(w)
            amount_sc = get_amount_course(w)
        else:
            amount_sc = amount_st

        w = None
        for i in table[2].descendants:
            r = re.search("Total nett course fee payable,", str(i.string))
            if r is not None:
                w = i
                break
        if w is not None:
            w = get_parent_td(w)
            amount_ct = get_amount_course(w)
        else:
            amount_ct = [0, 0, 0, 0]

        w = None
        for i in table[2].descendants:
            r = re.search('from the various funding schemes', str(i.string))
            if r is not None:
                w = i
                break
        if w is not None:
            w = get_parent_td(w)
            amount_cc = get_amount_course(w)
            for i in range(4):
                if amount_cc[i] == '-':
                    amount_cc[i] = amount_ct[i]

        else:
            amount_cc = amount_ct
        exam_fee_s = get_exam_fee(table[1])
        exam_fee_c = get_exam_fee(table[3])
    elif l == 1:
        exam_fee_s = get_Aop_fee(table[0])
    print(amount_st)
    parent = ''
    tag = soup.find_all('div', {'class': 'block-accordion-courses--header'})
    for i in tag:
        if parent == '':
            parent = i.string
        else:
            parent = parent + ", " + i.string
    for i in range(len(classes)):
        courseName.append(contentTitle)
        referenceNo.append(reference_no)
        coursePartOf.append(partOf)
        courseDuration.append(duration)
        courseTime.append(CourseTime)
        courseIntroduction.append(courseIntro)
        courseClasses.append(classes[i])
        courseDates.append(class_dates[i])
        courseTakeAways.append(keyTakeAways)
        courseAttendes.append(whoShouldAttend)
        courseTopicsCovered.append(topicsCovered)
        courseSsip_t.append(amount_st[0])
        courseSspr_t.append(amount_st[1])
        courseSssfmces_t.append(amount_st[2])
        courseSswfts_t.append(amount_st[3])
        courseSsip_c.append(amount_sc[0])
        courseSspr_c.append(amount_sc[1])
        courseSssfmces_c.append(amount_sc[2])
        courseSswfts_c.append(amount_sc[3])
        courseCsip_t.append(amount_ct[0])
        courseCspr_t.append(amount_ct[1])
        courseCssfmces_t.append(amount_ct[2])
        courseCswfts_t.append(amount_ct[3])
        courseCsip_c.append(amount_cc[0])
        courseCspr_c.append(amount_cc[1])
        courseCssfmces_c.append(amount_cc[2])
        courseCswfts_c.append(amount_cc[3])
        courseExamS.append(exam_fee_s)
        courseExamC.append(exam_fee_c)
        courseParent.append(parent)
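web_parser() depends on helpers that are not shown (get_parent_td, get_amount_course, get_simple_cfee, get_exam_fee, get_Aop_fee) and on a large set of module-level course* lists. Their implementations are not available; as one illustration, the call sites pass a string node found inside a fee table and then read values from its cell, so a plausible get_parent_td would simply climb to the enclosing td tag:

# Hypothetical reconstruction of get_parent_td(); the original helper is not
# shown, this only illustrates the "find text, climb to its table cell" step.
def get_parent_td(node):
    tag = node.parent
    while tag is not None and tag.name != 'td':
        tag = tag.parent
    return tag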