Example #1
 def getSoup(self):
     response = R.get(self.strURL,
                      headers={'User-Agent': UserAgent().chrome})
     self.soup = BS(response.content, "html.parser")
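A minimal, self-contained sketch of the context this method appears to assume (the class name Scraper and the example URL are placeholders; UserAgent is assumed to come from the fake_useragent package):

import requests as R
from bs4 import BeautifulSoup as BS
from fake_useragent import UserAgent  # assumed source of UserAgent().chrome


class Scraper:
    def __init__(self, strURL):
        self.strURL = strURL
        self.soup = None

    def getSoup(self):
        # send a browser-like User-Agent header with the request
        response = R.get(self.strURL,
                         headers={'User-Agent': UserAgent().chrome})
        self.soup = BS(response.content, "html.parser")


# usage sketch
# scraper = Scraper("https://example.com")
# scraper.getSoup()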
Example #2
from requests import Session
from bs4 import BeautifulSoup as BS

s = Session()

kk = open('t.txt', 'r').read()
w = open('link_t_shirt.txt', 'w')
kk = kk.split('\n')
m = 'http://www.esuppliersindia.com'
#kk=['http://www.esuppliersindia.com/suppliers/apparel-fashion/t-shirts/']
for i in kk:
    r = s.post(i)
    soup = BS(r.content, 'html.parser')

    num = 2
    while True:
        ll = soup.find_all('td', 'bluebg')
        if not ll:
            # stop once a page has no more supplier rows
            break
        for j in ll:
            l = j.find('a')
            w.write(r.url + ' : ' + m + l.get('href') + '\n')
        print(r.url)
        # request and re-parse the next page so the loop actually advances
        n = '?page_no=' + str(num)
        r = s.post(i + n)
        soup = BS(r.content, 'html.parser')
        num += 1

w.close()
def downloadImage(ref, folder, n):
    # fetch the page (hdr is a headers dict defined elsewhere in the script)
    html = urllib.request.urlopen(urllib.request.Request(ref, headers=hdr))
    soup = BS(html, "html.parser")
    img = soup.find(id="img").attrs['src']
    r = requests.get(img, allow_redirects=True)
    with open("./" + folder + "/" + str(n) + ".jpg", 'wb') as f:
        f.write(r.content)
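downloadImage relies on a hdr headers dict and imports defined elsewhere in its script; a hedged sketch of that missing setup and a sample call (the URL and folder name are placeholders):

import urllib.request
import requests
from bs4 import BeautifulSoup as BS

# assumed: a browser-like headers dict defined somewhere above the function
hdr = {'User-Agent': 'Mozilla/5.0'}

# usage sketch: save the element with id="img" from a page into ./covers/1.jpg
# (the "covers" folder must already exist)
# downloadImage("https://example.com/chapter-1.html", "covers", 1)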
for uni in unis:

    # build the search URL for this university and course
    link_p = ("https://www.theuniguide.co.uk/search/course?utf8=%E2%9C%93&c%5Bq%5D="
              + str(off_course_name)
              + "&c%5Bacademic_years%5D=2021&c%5Binstitution_slug%5D%5B%5D="
              + uni.name.lower() + "-" + uni.code + "&c%5Bsort%5D=relevance")
    link = str(link_p)

    url = requests.get(link)

    src = url.content  # getting the html of page

    soup = BS(src, "lxml")  # making the html parsable

    # extracting info

    name = uni.name.title()
    name_txt = name.replace("-", " ")

    try:
        courses_type = soup.find_all("div", {"class": "course-snippets"})

        # selecting a Bachelors degree

        if "B" in courses_type[0].text:
            course_text = courses_type[0].text
            num = 0
    except IndexError:
        # the excerpt is cut off here; close the try block so the snippet parses
        continue
Example #5
import time

import requests
from bs4 import BeautifulSoup as BS

session = requests.Session()
headers = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 5.1; rv:47.0) Gecko/20100101 Firefox/47.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
}
base_url = 'https://www.work.ua/jobs-kyiv-python/'

domain = 'https://www.work.ua'
jobs = []
urls = []
urls.append(base_url)
req = session.get(base_url, headers=headers)
if req.status_code == 200:
    bsObj = BS(req.content, "html.parser")
    pagination = bsObj.find('ul', attrs={'class': 'pagination'})
    if pagination:
        pages = pagination.find_all('li', attrs={'class': False})
        for page in pages:
            urls.append(domain + page.a['href'])

for url in urls:
    time.sleep(2)
    req = session.get(url, headers=headers)
    if req.status_code == 200:
        bsObj = BS(req.content, "html.parser")
        div_list = bsObj.find_all('div', attrs={'class': 'job-link'})
        for div in div_list:
            title = div.find('h2')
            href = title.a['href']
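The jobs list declared above is never filled in this excerpt; a minimal, assumed completion could collect the absolute vacancy URLs like this:

            # hypothetical completion: store the absolute link of each vacancy
            jobs.append(domain + href)

print(len(jobs), 'vacancies found')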
Example #6
import bs4
from bs4 import BeautifulSoup as BS
from urllib.request import urlopen as ureq

uClient = ureq("https://uwaterloo.ca/")
thepage = uClient.read()
uClient.close()
soupdata = BS(thepage, 'lxml')

i = 0  # counter used to name images that have no alt text
soup = BS(thepage, 'lxml')  # parse the downloaded page, not the URL string
for img in soup.findAll("img"):
    temp = img.get("src")
    if temp[:1] == "/":
        image = "https://uwaterloo.ca/" + temp
    else:
        image = temp

    nametemp = img.get("alt")
    if not nametemp:  # "alt" may be missing (None) or empty
        filename = str(i)
        i += 1
    else:
        filename = nametemp

    imagefile = open(filename + ".jpeg", "wb")
    imagefile.write(ureq(image).read())
    imagefile.close()
Example #7
def Soup(content):
    from bs4 import BeautifulSoup as BS
    return BS(content, "html.parser")
def get_hotel_review(url):

    print('Get hotel review url.')    
    html = requests.get(url)
    soup = BS(html.content,'html.parser')
    page_no = []
    temp_url = []
    
    for l1 in soup.findAll('div', {'id':"REVIEWS"}):
        for l2 in l1.findAll('span', {'class':"pageNum last taLnk "}):
            page_no.append(l2.get('data-page-number'))
    print('Review pages: ' + page_no[0])   

    # Get review link ###    
    for l1 in soup.findAll('div', {'class':"quote"}):
        container = l1.find('a')
        temp_url.append('https://www.tripadvisor.com.sg' + container['href'])    
    
    userReviewURL = []
    userReviewURL.append(temp_url[0])

    # To loop through all reviews pages
    for i in range(int(page_no[0])-1):          # To use this line when running full scrap (all pages of reviews)
    #for i in range(2):                         # To use this line when running partial scrap for debug
        html = requests.get(userReviewURL[i])
        print(userReviewURL[i])
        soup = BS(html.content,'html.parser')
        container = soup.find('a',{'data-page-number':i+2})
        urlTemp = 'https://www.tripadvisor.com.sg' + container['href']
        userReviewURL.append(urlTemp)    

    # Read review ###    
    print('Reading reviews.')  
    temp = []
    uids = []

    global names
    global ratings
    global dates
    global titles
    global bodies
    global recommendTitles
    global recommendAnswers    
    names[:] = []
    ratings[:] = []
    dates[:] = []
    titles[:] = []
    bodies[:] = []
    recommendTitles[:] = []
    recommendAnswers[:] = []

    for i in range(len(userReviewURL)):
        html = requests.get(userReviewURL[i])
        soup = BS(html.content,'html.parser')
        container = soup.find('div',{'id':'SHOW_USER_REVIEW'})    
        print('Parsing url : ' + userReviewURL[i])

        for j in range(5):
            temp = container.findAll('div',{'id':re.compile('^review_')})[j]
            name = temp.find('span',{'class':re.compile('^expand_inline')})
            if(name is None):
                continue
            rating = temp.find('span',{'class':re.compile('^ui_bubble_rating')})['class'][1]
            date = temp.find('span',{'class':'ratingDate'}).next_element
                
            if j == 0:
                title = temp.find('div',{'property':'name'})
                body = temp.find('p',{'property':'reviewBody'})
            else:
                title = temp.find('span',{'class':'noQuotes'})
                body = temp.find('p',{'id':re.compile('^review_')})
            recommendTitle = temp.find('span',{'class':'recommend-titleInline'})
            recommendAnswer = temp.findAll('li',{'class':'recommend-answer'})
            memberInfo = temp.find('div',{'class':'member_info'})

            memberOverlayLink = memberInfo.find('div',{'class':'memberOverlayLink'})
            if(memberOverlayLink is not None):
                uid = memberOverlayLink['id']
                print('uid : ' + uid)
            else:
                uid = ""
        
            if name is not None and len(name) > 0:
                names.append(name.text)
            else:
                names.append('')
            if rating is not None and len(rating) > 0:
                ratings.append(rating[7])
            else:
                ratings.append('')
            if date is not None and len(date) > 0:
                dates.append(date)
            else:
                dates.append('')
            if title is not None and len(title) > 0:
                titles.append(title.text)
            else:
                titles.append('')
            if body is not None and len(body) > 0:
                bodies.append(body.text.strip('\n'))
            else:
                bodies.append('')
            if recommendTitle is not None and len(recommendTitle) > 0:
                recommendTitles.append(recommendTitle.text)
            else:
                recommendTitles.append('')
            if recommendAnswer is not None and len(recommendAnswer) > 0:
                jsonTemp = {}
                for k in range(len(recommendAnswer)):
                    jsonTemp[recommendAnswer[k].text.strip('\n')] = recommendAnswer[k].find('span')['alt'][0]
                recommendAnswers.append(json.dumps(jsonTemp))
            else:
                recommendAnswers.append('')
            if uid is not None and len(uid) > 0:
                uids.append(uid[4:uid.find('-SRC')])
            else:
                uids.append('')
    
    write_to_mongoDB("user_review")   
    get_member_profile(uids)
def get_member_profile(uids):
    
    print('Get member profile.')    
    print(uids)
    memberOverlayLink = []
    memberProfileURL = []

    for i in range(len(uids)):
        if len(uids[i]) > 0:
            memberOverlayLink.append('https://www.tripadvisor.com.sg/MemberOverlay?Mode=owa&uid=' + str(uids[i]))
            memberProfileURL.append('https://www.tripadvisor.com.sg/MemberProfile-a_uid.' + str(uids[i]))
        else:
            memberOverlayLink.append('')
            memberProfileURL.append('')

    global ageGenders
    global hometowns
    global travelStyleTags
    global points
    global levels
    global usernames
    global ages
    global genders
    
    ageGenders[:] = []
    hometowns[:] = []
    travelStyleTags[:] = []
    points[:] = []
    levels[:] = []
    usernames[:] = []
    ages[:] = []
    genders[:] = []
    
    for i in range(len(memberProfileURL)):
        if(memberProfileURL[i] is not None and len(memberProfileURL[i]) > 0):
            print('memberProfileURL[i] : '  + memberProfileURL[i])
            html = requests.get(memberProfileURL[i])
            soup = BS(html.content,'html.parser')
            container = soup.find('div',{'id':'MODULES_MEMBER_CENTER'})
            if(container is not None):
                ageGender = container.find('div',{'class':'ageSince'})
                hometown = container.find('div',{'class':'hometown'})
                travelStyleTag = container.findAll('div',{'class':'tagBubble unclickable'})
                point = container.find('div',{'class':'points'})
                level = container.find('div',{'class':'level tripcollectiveinfo'})
                username = container.find('span',{'class':'nameText'})
                try:
                    print("username: " + username.text)
                except Exception:
                    # some usernames contain characters the console cannot encode
                    print("unable to show username due to unicode text")
                if len(ageGender) > 0:
                    ageGenders.append(ageGender.text[14:].strip())
                    splitAgeGenders = ageGender.text[14:].strip().split('old')
                    if(len(splitAgeGenders)==1):
                        ages.append(splitAgeGenders[0])
                        genders.append('')
                    elif(len(splitAgeGenders)==2):
                        ages.append(splitAgeGenders[0])
                        genders.append(splitAgeGenders[1])
                else:
                    ageGenders.append('')
                if len(hometown) > 0:
                    hometowns.append(hometown.text)
                else:
                    hometowns.append('')
                if len(travelStyleTag) > 0:
                    listTemp = []
                    for j in range(len(travelStyleTag)):
                        listTemp.append(travelStyleTag[j].text.strip())
                    travelStyleTags.append(listTemp)
                else:
                    travelStyleTags.append('')
                if len(point) > 0:
                    points.append(int(str(point.text.strip()).replace(',','')))
                else:
                    points.append('')
                if level is not None:
                    levels.append(level.text[6:level.text.find(' ',6)].strip())
                else:
                    levels.append('')
                if len(username) > 0:
                    usernames.append(username.text)
                else:
                    usernames.append('')
    write_to_mongoDB("member_profile")
Example #10
    def handle(self, *args, **options):
        post = Post.objects.filter(hide=False,
                                   source__icontains='http://chihuo.org/',
                                   is_approved=False)
        headers = {
            'User-Agent':
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
            # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            # 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
            # 'Accept-Encoding': 'none',
            # 'Accept-Language': 'en-US,en;q=0.8',
            # 'Connection': 'keep-alive',
            'referer': 'http://www.yelp.com/',
        }
        for p in post:
            name = p.source.replace('http://chihuo.org/', '')
            name = name.replace('/', '')
            name = name.replace('-', '+')
            keyword = name + '+LA+yelp'
            google_url = 'https://www.google.com/search?q=%s' % keyword

            google_soup = BS(
                urllib2.urlopen(urllib2.Request(google_url,
                                                headers=headers)).read())
            urls = google_soup.select('h3.r a')
            yelp_url = None
            for u in urls:
                if 'yelp.com' in u['href']:
                    yelp_url = u['href']
                    break
            if not yelp_url:
                print 'Yelp link not found'
                yelp_url = raw_input('Enter Yelp link: ')

            hot_area = '洛杉矶'  # "Los Angeles"
            soup = BS(
                urllib2.urlopen(urllib2.Request(yelp_url,
                                                headers=headers)).read())

            name = soup.find('h1', {
                'class': 'biz-page-title'
            }).get_text().lstrip().rstrip()
            print name
            name2 = raw_input('Enter Chinese name if has: ')

            tags = soup.find('span', {'class': 'category-str-list'}).get_text()
            print tags
            tag = raw_input('Enter tags: ')
            if not tag:
                return

            phone = soup.find('span', {'class': 'biz-phone'}).get_text()
            phone = ''.join([s for s in phone if s.isdigit()])
            street1 = soup.find('span', {
                'itemprop': 'streetAddress'
            }).get_text()
            city_text = soup.find('span', {
                'itemprop': 'addressLocality'
            }).get_text()
            state_text = soup.find('span', {
                'itemprop': 'addressRegion'
            }).get_text()
            postcode_text = soup.find('span', {
                'itemprop': 'postalCode'
            }).get_text()
            map_url = soup.select('a.biz-map-directions img')[0]['src']
            query = urlparse(map_url).query
            query_parsed = parse_qsl(query)
            lat = None
            lng = None
            for q in query_parsed:
                if q[0] == 'center':
                    center = q[1]
                    lat = center.split(',')[0]
                    lng = center.split(',')[1]

            try:
                state = State.objects.get(name=state_text)
            except:
                state = State(name=state_text)
                state.save()

            try:
                city = City.objects.get(name=city_text)
            except:
                city = City(name=city_text, state=state)
                city.save()

            try:
                postcode = Postcode.objects.get(number=postcode_text)
            except:
                postcode = Postcode(number=postcode_text, state=state)
                postcode.save()

            try:
                hot_area = Hot_area.objects.get(name=hot_area)
            except:
                hot_area = Hot_area(name=hot_area)
                hot_area.save()

            business = Business(
                name=name,
                name2=name2,
                phone=phone,
                street1=street1,
                city=city,
                postcode=postcode,
                latitude=lat,
                longitude=lng,
                hot_area=hot_area,
            )

            photo_url = soup.select('.showcase-photo-box img')[0]['src']
            ext = mimetypes.guess_extension(mimetypes.guess_type(photo_url)[0])
            req = urllib2.Request(photo_url, headers=headers)
            img_temp = NamedTemporaryFile(delete=True)
            img_temp.write(urllib2.urlopen(req).read())
            img_temp.flush()

            business.photo.save('%s%s' % (uuid.uuid4(), ext), File(img_temp))
            print '--- Are you sure? ---'
            print p.title, business.name, business.name2
            sure = raw_input('Are you sure? ')
            if sure in ['y', 'Y', 'yes', 'YES', 'Yes']:
                business.save()
                # tag
                tag = tag.split(' ')
                for t in tag:
                    try:
                        t_instance = Tag.objects.get(name=t)
                    except:
                        t_instance = Tag(name=t)
                        t_instance.save()
                    business.tag.add(t_instance)

                p.hide = True
                p.business = business
                p.save()
                print 'Success'
            else:
                print 'Fail'
Example #11
#
# title: Scraping a section of webpage based on text
# url: https://stackoverflow.com/questions/67754320/scraping-a-section-of-webpage-based-on-text/67756231#67756231

import selenium.webdriver
from bs4 import BeautifulSoup as BS
import time

url = 'https://www.flashscore.com/football/chile/primera-division/'

driver = selenium.webdriver.Firefox()
driver.get(url)

time.sleep(5)

soup = BS(driver.page_source, 'html.parser')

print('--- version 1 ---')

section = soup.find('div', id='live-table').find('section')

for item in section.find_all('div', title='Click for match detail!'):
    print(item.get('id'))

print('--- version 2 ---')

section = soup.find('section', class_='event--live')

for item in section.find_all('div', title='Click for match detail!'):
    print(item.get('id'))
Example #12
import requests
import re
from bs4 import BeautifulSoup as BS
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import sent_tokenize

#cleaning and extracting
url = 'http://shakespeare.mit.edu/allswell/full.html'
r = requests.get(url)
html = r.text

soup = BS(html, 'html5lib')
text = soup.get_text()
blockquote = soup.findAll('blockquote')

tokenizer = RegexpTokenizer(r"\w+[']*\w*")
tokens = tokenizer.tokenize(text)

s = ''
for block in blockquote:
    try:
        lines = block.find_all('a')
        for line in lines:
            s += line.text + ' '

    except:
        continue

new = " ".join(s.split())

with open("allswell.txt", 'w') as outfile:
    outfile.write(new)
Example #13
import os

import pandas as pd
import numpy as np
from bs4 import BeautifulSoup as BS
from bs4 import SoupStrainer


dir_str = str(os.getcwd())
#directory = os.fsencode(dir_str)
poynt = 0
for file in os.listdir(dir_str):
    filename = os.fsdecode(file)
    if filename.endswith('.html'):
        os.chdir(dir_str)
        print(filename)
        with open(filename,'r') as doc:
            parse = BS(doc,'lxml')
            for tag in parse.find_all('div',class_='col-xs-12 col-md-8 col-lg-9'):
                n_rows = 0
                n_columns = 0
                point = 0
                for titles in tag.find_all('div',class_='heading'):
                    point += 1
                    if point == 1:
                        column_title = []
                        for columns in titles.find_all('div'):
                            column_title.append(str(columns.text.strip()))
                            n_columns += 1
                        column_title.append('Beer Styles')
                        column_title.insert(2,'Maximum ' + str(column_title[1]))
                        column_title[1] = 'Minimum ' + str(column_title[1])
                        #print(column_title)
Example #14
import requests
from bs4 import BeautifulSoup as BS

url = 'https://producthunt.com'

r = requests.get(url)

html = r.text

soup = BS(html, 'html.parser')

for first in soup.find_all("main", id="app"):
    for second in first.find_all("div"):
        for third in second.find_all("div"):
            for a in first.find_all("div", class_="content_1mxGg"):
                for div in a.find_all("div", class_="legacyPosts_3AHFn"):
                    for ul in div.find_all("ul", class_="postsList_3n2Ck"):
                        for li in ul.find_all("li"):
                            for s in li.find_all("div",
                                                 class_="postItem_RepXj"):
                                for p in s.find_all("div",
                                                    class_="content_3Qj0y"):
                                    for q in p.find_all(
                                            "span",
                                            class_=
                                            "title_24w6f featured_2PenR default_25TkV base_3LqBu"
                                    ):
                                        print(q.text)

# leftover scratch code:
# b=a.find_all(")[0]
# l= b.find_all("ul",class_="postsList_3n2Ck")[0]
Example #15
def appleipad(message):
    try:
        if message.text == 'ipad 7 10.2"':
            Rozetkaipad_7 = requests.get(
                'https://rozetka.com.ua/apple-ipad-7-10-2-a2197-wi-fi-32gb-2019-16127127/g22946813/'
            )
            Rozetkaipad_7 = Rozetkaipad_7.text
            Rozetkaipad_7 = BS(Rozetkaipad_7, 'html.parser')
            Rozetkaipad_7 = Rozetkaipad_7.find(
                'p', {'class': "product-prices__big"})
            Rozetkaipad_7 = list(Rozetkaipad_7)
            Rozetkaipad_7keyboard = types.InlineKeyboardMarkup()
            Rozetkaipad_7button = types.InlineKeyboardButton(
                text=RozetkaButtonText,
                url=
                'https://rozetka.com.ua/apple-ipad-7-10-2-a2197-wi-fi-32gb-2019-16127127/g22946813/'
            )
            Rozetkaipad_7keyboard.add(Rozetkaipad_7button)

            Fixeripad_7 = requests.get(
                'https://fixer.com.ua/pda/apple-a2197-ipad-10.2-wi-fi-32gb-space-grey-mw742rk-a_976052.html'
            )
            Fixeripad_7 = Fixeripad_7.text
            Fixeripad_7 = BS(Fixeripad_7, 'html.parser')
            Fixeripad_7 = Fixeripad_7.find('div', {'class': "pi_price"})
            Fixeripad_7 = str(Fixeripad_7)
            Fixeripad_7 = list(Fixeripad_7)
            Fixeripad_7keyboard = types.InlineKeyboardMarkup()
            Fixeripad_7button = types.InlineKeyboardButton(
                text=fixerbuttontext,
                url=
                'https://fixer.com.ua/pda/apple-a2197-ipad-10.2-wi-fi-32gb-space-grey-mw742rk-a_976052.html'
            )
            Fixeripad_7keyboard.add(Fixeripad_7button)

            Brainipad_7 = requests.get(
                'https://brain.com.ua/ukr/Planshet_Apple_A2197_iPad_102_Wi-Fi_32GB_Silver_MW752RK_A-p638525.html'
            )
            Brainipad_7 = Brainipad_7.text
            Brainipad_7 = BS(Brainipad_7, 'html.parser')
            Brainipad_7 = Brainipad_7.find('div', {'class': "br-pr-np"})
            Brainipad_7 = str(Brainipad_7)
            Brainipad_7 = list(Brainipad_7)
            Brainipad_7keyboard = types.InlineKeyboardMarkup()
            Brainipad_7button = types.InlineKeyboardButton(
                text=brainbuttontext,
                url=
                'https://brain.com.ua/ukr/Planshet_Apple_A2197_iPad_102_Wi-Fi_32GB_Silver_MW752RK_A-p638525.html'
            )
            Brainipad_7keyboard.add(Brainipad_7button)

            bot.send_message(message.chat.id,
                             'Rozetka - ' + Rozetkaipad_7[0] + ' ₴',
                             reply_markup=Rozetkaipad_7keyboard)
            bot.send_message(message.chat.id,
                             'Fixer - ' + Fixeripad_7[114] + Fixeripad_7[115] +
                             ' ' + Fixeripad_7[116] + Fixeripad_7[117] +
                             Fixeripad_7[118] + ' ₴',
                             reply_markup=Fixeripad_7keyboard)
            bot.send_message(message.chat.id,
                             'Brain - ' + Brainipad_7[139] + Brainipad_7[140] +
                             ' ' + Brainipad_7[141] + Brainipad_7[142] +
                             Brainipad_7[143] + ' ₴',
                             reply_markup=Brainipad_7keyboard)

    except:
        bot.send_message(message.chat.id, 'An error occurred')
def getMatches(team = ''):

	url = "http://www.espncricinfo.com/ci/engine/match/index.html?view=live"
	r = requests.get(url)
	soup = BS(r.text, "html.parser")
	tableHeads = soup.find_all('div', {'class' : 'match-section-head'})
	tableData = soup.find_all('section', {'class' : 'matches-day-block'})
	team_matches = []
	
	if team == '':
		print "\nHere are the events going on live right now: "
		for ix in range(0, len(tableHeads)):
			print "\t" + str(ix+1) + ". " + str(tableHeads[ix].h2.text)

		try:
			ch = raw_input("\nChoose the event for which you wish to check out the matches (Enter 0 to See All; -1 to exit): ")
			ch = int(ch)
			if ch == -1:
				sys.exit("Hope you had fun. Have a great day ahead!")
			temp = tableData[ch - 1] or (ch == 0)
			
		except (IndexError, ValueError):
			print 'Please enter a valid integer between -1 and ' + str(len(tableData)) + '.'
			askForExit()
	else:
		ch = 0

	if ch > 0:
		matches = tableData[ch-1].find_all('section', {'class' : 'default-match-block'})
	
	else:
		matches = tableData[0].find_all('section', {'class' : 'default-match-block'})
		for ix in range(1, len(tableData)):
			matches = matches + tableData[ix].find_all('section', {'class':'default-match-block'})

	for ix in range(0,len(matches)):
		
		matchDetails = matches[ix].find_all('div')
		
		team1 = str(matchDetails[1].text.split('\n',1)[1].split(' ')[0])
		if len(str(matchDetails[1].text.split('\n',1)[1].split(' ')[1]))>0:
			team1 = team1 + " " + str(matchDetails[1].text.split('\n',1)[1].split(' ')[1])
		score1 = str(matchDetails[1].find('span').text)
		if len(str(matchDetails[1].text.split('\n',1)[1].split(' ')[2]))>0:
			team1 = team1 + " " + str(matchDetails[1].text.split('\n',1)[1].split(' ')[2])
		score2 = str(matchDetails[2].find('span').text)
		
		team2 = str(matchDetails[2].text.split('\n',1)[1].split(' ')[0])
		if len(str(matchDetails[2].text.split('\n',1)[1].split(' ')[1]))>0:
			team2 = team2 + " " + str(matchDetails[2].text.split('\n',1)[1].split(' ')[1])
		if len(str(matchDetails[2].text.split('\n',1)[1].split(' ')[2]))>0:
			team2 = team2 + " " + str(matchDetails[2].text.split('\n',1)[1].split(' ')[2])

		headerline = "Match " + str(ix+1) + ": " + team1 + " vs " + team2
		if len(headerline)<40:
			headerline += (" " * (40 - len(headerline)))
		
		if team in ['', team1.lower(), team2.lower()]:
			team_matches.append(ix+1)
			print "\n" + headerline + "\t\t(" + str(matchDetails[0].find('span', {'class':'bold'}).text) +")"
			print str(matchDetails[0].find('span', class_='match-no').a.text.split('     ',1)[1])
			print "\t" + team1 + ": " + score1 + "\n\t" + team2 + ": " + score2
			print "\n" + matchDetails[3].text.split('\n')[1]
			print "_"*50

	if len(team_matches) == 0 and team != '':
		print 'Sorry! No match found for team ' + team + '.'
		getMatches('')

	try:
		if len(team_matches) == 1:
			ch = team_matches[0]
		else:
			ch = raw_input("\nChoose the event for which you wish to see the whole scorecard (Enter -1 to Exit; 0 for previous menu): ")
			ch = int(ch)
			if ch == -1:
				sys.exit("Hope you had fun. Have a great day ahead!")
			if ch == 0:
				getMatches(' ')
			temp = matches[ch - 1]

	except (IndexError, ValueError):
		print 'Please enter a valid integer between -1 and ' + str(len(matches)) + '.'
		askForExit()

	url2 = "http://www.espncricinfo.com" + matches[ch-1].find_all('div')[4].find_all('a')[0]['href'] + "?view=scorecard"
	matchDetails = matches[ch-1].find_all('div')
	team1 = str(matchDetails[1].text.split('\n',1)[1].split(' ')[0])
	if len(str(matchDetails[1].text.split('\n',1)[1].split(' ')[1]))>0:
		team1 = team1 + " " + str(matchDetails[1].text.split('\n',1)[1].split(' ')[1])
	score1 = str(matchDetails[1].find('span').text)
	if len(str(matchDetails[1].text.split('\n',1)[1].split(' ')[2]))>0:
		team1 = team1 + " " + str(matchDetails[1].text.split('\n',1)[1].split(' ')[2])
	score2 = str(matchDetails[2].find('span').text)

	team2 = str(matchDetails[2].text.split('\n',1)[1].split(' ')[0])
	if len(str(matchDetails[2].text.split('\n',1)[1].split(' ')[1]))>0:
		team2 = team2 + " " + str(matchDetails[2].text.split('\n',1)[1].split(' ')[1])
	if len(str(matchDetails[2].text.split('\n',1)[1].split(' ')[2]))>0:
		team2 = team2 + " " + str(matchDetails[2].text.split('\n',1)[1].split(' ')[2])

	meta = "\t" + team1 + ": " + score1 + "\n\t" + team2 + ": " + score2
	meta += "\n\n" + matchDetails[3].text.split('\n')[1]
Example #17
def IPhonxs(message):
    if message.text == 'Iphone Xs 64GB':
        try:
            RozetkaAppleiPhoneXs64GB = requests.get(
                'https://rozetka.com.ua/apple_iphone_xs_64gb_space_gray/p172930640/'
            )
            RozetkaAppleiPhoneXs64GB = (RozetkaAppleiPhoneXs64GB.text)
            RozetkaAppleiPhoneXs64GB = BS(RozetkaAppleiPhoneXs64GB,
                                          'html.parser')
            RozetkaAppleiPhoneXs64GB = RozetkaAppleiPhoneXs64GB.find(
                'p',
                {'class': 'product-prices__big product-prices__big_color_red'})
            RozetkaAppleiPhoneXs64GB = list(RozetkaAppleiPhoneXs64GB)
            RozetkaAppleiPhoneXs64GB = RozetkaAppleiPhoneXs64GB[0]
            RozetkaAppleiPhoneXs64GBkeyboard = types.InlineKeyboardMarkup()
            RozetkaAppleiPhoneXs64GBbutton = types.InlineKeyboardButton(
                text=RozetkaButtonText,
                url=
                'https://rozetka.com.ua/apple_iphone_xs_64gb_space_gray/p172930640/'
            )
            RozetkaAppleiPhoneXs64GBkeyboard.add(
                RozetkaAppleiPhoneXs64GBbutton)

            FokstrotAppleiPhoneXs64GB = requests.get(
                'https://www.foxtrot.com.ua/ru/shop/mobilnye_telefony_apple_iphone-xs-64gb-silver.html?utm_medium=cpc&utm_source=hotline&utm_campaign=DIG-mobilephohe&utm_term=mobilnye_telefony_apple_iphone-xs-64gb-silver&utm_content=6423074'
            )
            FokstrotAppleiPhoneXs64GB = (FokstrotAppleiPhoneXs64GB.text)
            FokstrotAppleiPhoneXs64GB = BS(FokstrotAppleiPhoneXs64GB,
                                           'html.parser')
            FokstrotAppleiPhoneXs64GB = FokstrotAppleiPhoneXs64GB.find(
                'div', {'class': "card-price"})
            FokstrotAppleiPhoneXs64GB = list(FokstrotAppleiPhoneXs64GB)
            FokstrotAppleiPhoneXs64GB = FokstrotAppleiPhoneXs64GB[0]
            FokstrotPhoneXs64GBkeyboard = types.InlineKeyboardMarkup()
            FokstrotPhoneXs64GBbutton = types.InlineKeyboardButton(
                text=FokstrotButtonText,
                url=
                'https://www.foxtrot.com.ua/ru/shop/mobilnye_telefony_apple_iphone-xs-64gb-silver.html?utm_medium=cpc&utm_source=hotline&utm_campaign=DIG-mobilephohe&utm_term=mobilnye_telefony_apple_iphone-xs-64gb-silver&utm_content=6423074'
            )
            FokstrotPhoneXs64GBkeyboard.add(FokstrotPhoneXs64GBbutton)

            buyAppleiPhoneXs64GB = requests.get(
                'http://www.buy.ua/shop/1400215/1400417/1712920.html')
            buyAppleiPhoneXs64GB = (buyAppleiPhoneXs64GB.text)
            buyAppleiPhoneXs64GB = BS(buyAppleiPhoneXs64GB, 'html.parser')
            buyAppleiPhoneXs64GB = buyAppleiPhoneXs64GB.find(
                'div', {'class': "price-info"})
            buyAppleiPhoneXs64GB = str(buyAppleiPhoneXs64GB)
            buyAppleiPhoneXs64GB = list(buyAppleiPhoneXs64GB)
            buyPhoneXs64GBkeyboard = types.InlineKeyboardMarkup()
            buyPhoneXs64GBbutton = types.InlineKeyboardButton(
                text=buyButtonText,
                url='http://www.buy.ua/shop/1400215/1400417/1712920.html')
            buyPhoneXs64GBkeyboard.add(buyPhoneXs64GBbutton)

            bot.send_message(message.chat.id,
                             'Rozetka - ' + RozetkaAppleiPhoneXs64GB + ' ₴',
                             reply_markup=RozetkaAppleiPhoneXs64GBkeyboard)
            bot.send_message(message.chat.id,
                             'Фокстрот - ' + FokstrotAppleiPhoneXs64GB + ' ₴',
                             reply_markup=FokstrotPhoneXs64GBkeyboard)
            bot.send_message(message.chat.id,
                             'Buy.ua - ' + str(buyAppleiPhoneXs64GB[55]) +
                             str(buyAppleiPhoneXs64GB[56]) + ' ' +
                             str(buyAppleiPhoneXs64GB[57]) +
                             str(buyAppleiPhoneXs64GB[58]) +
                             str(buyAppleiPhoneXs64GB[59]) + ' ₴',
                             reply_markup=buyPhoneXs64GBkeyboard)

        except:
            bot.send_message(message.chat.id, 'An error occurred')
Example #18
        combinedDF['highestWord'].values[i] = individualWordsResult[
            'highestWord']
        combinedDF['highestWordCount'].values[i] = individualWordsResult[
            'highestWordCount']
        combinedDF['lowestWord'].values[i] = individualWordsResult[
            'lowestWord']
        combinedDF['lowestWordCount'].values[i] = individualWordsResult[
            'lowestWordCount']

        if result > 0:
            # it exists (less common case), now find out where

            print("Checking for [" + needle + "] in DOM tags of " + haystack)

            soup = BS(lowerpagetext, 'html.parser')

            combinedDF['KeywordFoundinHTags'].values[i] = find_by_text(
                soup, needle, 'h1') + find_by_text(
                    soup, needle, 'h2') + find_by_text(soup, needle, 'h3')

            combinedDF['KeywordFoundinTitle'].values[
                i] = soup.title.string.lower().count(lowerneedle)

    with pd.ExcelWriter(name + '.xlsx') as writer:
        combinedDF.to_excel(writer, sheet_name='data')

        print("finished and output to excel file")
else:
    print("nothing found")
Example #19
 def futureQuote(self):
     self.driverf.get('http://info512.taifex.com.tw/Future/FusaQuote_Norl.aspx')
     time.sleep(0.1)
     soup = BS(self.driverf.page_source,'lxml')
     self.future_table = pd.read_html(str(soup.select('#divDG')[0]),header=0)[0]
def main(handle, code_list_file, code_path):
    with open(code_list_file, "r", encoding="utf-8") as f:
        codeList = json.load(f)

    num = 1
    count = 100
    while (True):
        conn = get_response("https://codeforces.com/api/user.status?",
                            f"handle={handle}&from={num}&count={count}")
        data = conn.read()
        conn.close()

        res = json.loads(data.decode("utf-8"))
        print(num)

        if res["status"] == "OK":
            results = res["result"]

            if results == []:
                break

            for result in results:
                if result['verdict'] != "OK":
                    continue
                contestId = result['problem']['contestId']
                contestIdx = result['problem']['index']
                solutionId = result['id']
                language = result['programmingLanguage']

                fName = str(contestId) + str(contestIdx)
                if fName not in codeList:
                    codeList[fName] = [solutionId]
                else:
                    if solutionId in codeList[fName]:
                        continue
                    else:
                        codeList[fName].append(solutionId)
                        fName += "_" + str(len(codeList[fName]))

                solutionApi = f'https://codeforces.com/contest/{contestId}/submission/{solutionId}'

                conn = get_response(solutionApi)
                bs_res = BS(conn, 'html.parser')
                conn.close()

                code = bs_res.find('pre').text.strip()
                code = re.sub("\r\n", "\n", code)

                fName += file_type[language]
                remark = make_remark(
                    result['problem']['name'], handle,
                    datetime.datetime.fromtimestamp(
                        result['creationTimeSeconds']).isoformat(), language)
                code = remark + code

                try:
                    with open(code_path + fName, "w", encoding="utf-8") as f:
                        f.write(code)
                except:
                    print(code_path + " does not exist or the file is currently open.")
                    sys.exit(1)

                time.sleep(random.uniform(1, 2))
        num += count

    try:
        with open(code_list_file, "w", encoding="utf-8") as f:
            json.dump(codeList, f)
    except:
        print("An error occurred while rewriting " + code_list_file + ".")
        sys.exit(1)
Example #21
"""
Code Challenge 4

Scrape the data from the URL below and store it in an sqlite database

https://www.icc-cricket.com/rankings/mens/team-rankings/odi

"""

from bs4 import BeautifulSoup as BS
import requests

url = "https://www.icc-cricket.com/rankings/mens/team-rankings/odi"
source = requests.get(url).text

soup = BS(source, 'lxml')
print(soup.prettify())

right_table = soup.find('table', class_='table')
print(right_table.prettify())

A = []
B = []
C = []
D = []
E = []

for i in right_table.findAll('tr'):
    rows = i.findAll('td')
    if len(rows) == 5:
        A.append(rows[0].text.strip())
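The challenge text above asks for the scraped rows to be stored in an sqlite database, which this excerpt never gets to; a minimal sketch with the standard sqlite3 module (the database, table, and column names are assumptions) might look like:

import sqlite3

conn = sqlite3.connect('icc_rankings.db')
cur = conn.cursor()
cur.execute('CREATE TABLE IF NOT EXISTS rankings '
            '(pos TEXT, team TEXT, matches TEXT, points TEXT, rating TEXT)')

for tr in right_table.findAll('tr'):
    cells = tr.findAll('td')
    if len(cells) == 5:
        cur.execute('INSERT INTO rankings VALUES (?, ?, ?, ?, ?)',
                    tuple(td.text.strip() for td in cells))

conn.commit()
conn.close()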
def make_request(url):
    page = requests.get(url)
    soup = BS(page.text, 'html.parser')
    return soup
Example #23
"""
Author: https://github.com/Romawechka
Python version 3.8.5
"""

import requests
from bs4 import BeautifulSoup as BS

if __name__ == "__main__":
    req = requests.get('http://www.cbr.ru/scripts/XML_daily.asp')
    xml = BS(req.content, 'lxml')

    for el in xml.findAll('valute'):

        Gon_dollar = str(el.find('name')).replace('/',
                                                  '').replace('<name>', '')

        if Gon_dollar == 'Гонконгских долларов':
            value = str(el.find('value')).replace('/',
                                                  '').replace('<value>', '')
            char = str(el.find('charcode')).replace('/', '').replace(
                '<charcode>', '')

            print(f'1 {char} = {value} Rub')
            break

    inp = input(
        '\nIf you want to exit enter any message or just close the application.\n'
    )
    exit()
utterance will begin with NO NON-DOM. If there is non-dominant hand gloss in the utterance there will be **NON-DOM** followed by the non-dominant hand 
gloss."""

from bs4 import BeautifulSoup as BS
import re

partial_path = """<write the path to "ncslgr-xml">"""  # Write the path location where ncslgr-xml is saved on your local machine

dominant_only_gloss = ()
dominant_and_non_dominant_gloss = ()

with open(partial_path + r'\close call.xml', 'r') as f_IN:
    with open(
            """Path name to file output""", 'a'
    ) as f_OUT_utts:  # Write path to the file name you want to use to save the output to
        soup = BS(f_IN.read(), 'xml')

        for utterance_tag in soup.find_all('UTTERANCES'):
            for utterance_tags in utterance_tag.find_all('UTTERANCE'):
                if utterance_tags.find_all('TRACK', {'FID': '10001'}):
                    for dominant_track_tags in utterance_tags.find_all(
                            'TRACK', {'FID': '10000'}):
                        for dominant_a_tags in dominant_track_tags.find_all(
                                'A'):
                            if dominant_a_tags.has_attr('VID'):
                                dominant_a_tags.decompose()
                        for non_dominant_track_tags in utterance_tags.find_all(
                                'TRACK', {'FID': '10001'}):
                            for non_dominant_a_tags in non_dominant_track_tags.find_all(
                                    'A'):
                                if non_dominant_a_tags.has_attr('VID'):
Example #25
def price_guide(item, max_cost_quantile=None):
    """Fetch pricing info for an item"""
    results = []

    if (item['ItemTypeID'] == 'P' and 'stk0' in item['ItemID']) or \
        item['ItemTypeID'] == 'S' or \
        item['ItemTypeID'] == 'M':
        # a sticker sheet, a set, or a minifigure
        color_ids = [0]
    else:
        # a normal item
        color_ids = color.similar_to(item['ColorID'])

    for c in color_ids:
        # perform HTTP request
        parameters = {
            'itemType': item['ItemTypeID'],
            'itemNo': item['ItemID'],
            'itemSeq': 1,
            'colorId': c,
            'v': 'P',
            'priceGroup': 'Y',
            'prDec': 2
        }
        url = "http://www.bricklink.com/catalogPG.asp?" + urllib.urlencode(
            parameters)
        html = urllib.urlopen(url).read()

        # parse page
        page = BS(html, 'html.parser')

        if len(page.find_all(text='Currently Available')) == 0:
            # not available in this color :(
            continue
        else:

            # newly found inventory
            new = []

            for td in page.find_all('td'):
                if td.find('a', recursive=False,
                           href=re.compile('/store.asp')) is not None:
                    # find the td element with a link to a store. Its siblings contain
                    # the interesting bits like price and quantity available
                    store_url = td.find('a')['href']
                    store_id = int(utils.get_params(store_url)['sID'])
                    quantity = int(td.next_sibling.text)
                    cost_per_unit = float(
                        re.findall('[0-9.]+',
                                   td.next_sibling.next_sibling.text)[0])

                    new.append({
                        'item_id': item['ItemID'],
                        'wanted_color_id': item['ColorID'],
                        'color_id': c,
                        'store_id': store_id,
                        'quantity_available': quantity,
                        'cost_per_unit': cost_per_unit
                    })

            # remove items that cost too much
            if max_cost_quantile is not None and max_cost_quantile < 1.0:
                observed_prices = [
                    e['quantity_available'] * [e['cost_per_unit']] for e in new
                ]
                observed_prices = list(sorted(utils.flatten(observed_prices)))
                if len(observed_prices) > 0:
                    i = utils.quantile(
                        len(observed_prices) - 1, max_cost_quantile)
                    max_price = observed_prices[i]
                    new = filter(lambda x: x['cost_per_unit'] <= max_price,
                                 new)

            # add what's left to the considered inventory
            results.extend(new)

        if sum(e['quantity_available'] for e in results) >= item['Qty']:
            # stop early, we've got everything we need
            return results

    return results
Example #26
import re
import sys

import requests
import xlwt
from bs4 import BeautifulSoup as BS


##used regular expressions to remove the tags from each line
def removeTag(raw_text):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', raw_text)
    return cleantext


##Pull in the scoreboard and parse through it
try:
    page = requests.get('http://scoreboard.uscyberpatriot.org')
except:
    print("[!]Error: Webpage is unavailable...")
    sys.exit()

html = BS(page.content, 'html.parser')

##Set up for the Excel file
book = xlwt.Workbook()
sheetName = str(input("What round of competition is it?(ex. round1): "))
sheet = book.add_sheet(sheetName)

cols = [
    "Placement", "Team Number", "Location", "Division", "Tier",
    "Scored Images", "Play Time", "Current Score"
]
row = sheet.row(0)
for index, col in enumerate(cols):
    row.write(index, col)

print("~" * 15 + "Starting program" + "~" * 15)
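The excerpt stops before any team rows are written to the sheet; a hedged sketch of writing parsed rows with xlwt (the teams list here is a made-up placeholder matching the cols header):

# hypothetical: each entry mirrors the cols header defined above
teams = [["1", "12-3456", "TX", "Open", "Platinum", "3", "05:12", "298"]]
for r, team in enumerate(teams, start=1):
    data_row = sheet.row(r)
    for c, value in enumerate(team):
        data_row.write(c, value)

book.save(sheetName + ".xls")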
Example #27
import urllib2
from bs4 import BeautifulSoup as BS
import json

link = 'https://www.cbssports.com/nfl/stats/playersort/' \
       'nfl/year-2017-season-regular-category-touchdowns'
data = urllib2.urlopen(link)
soup = BS(data.read(), "html.parser")


def FStat():
    data_list = []
    fhandler = soup.find_all(class_={'row1', 'row2'})
    for tdata in fhandler[:20]:
        try:
            player_name = tdata.contents[0].get_text()
            player_position = tdata.contents[1].get_text()
            player_team = tdata.contents[2].get_text()
            touch_downs = tdata.contents[6].get_text()

            # defining json strings
            json_string = {
                "Name": player_name,
                "Position": player_position,
                'Team': player_team,
                "Touchdowns": touch_downs
            }
            print json.dumps(json_string)

        except:
            print 'Please double check the data, something went wrong'
Example #28
import requests
from bs4 import BeautifulSoup as BS
import re

session = requests.session()
page = session.post('https://yts.ag/')

#Home Page details

#Popular Movies

popular_movies = BS(page.text,"html.parser").find('div',{'id':'popular-downloads'})
##print(popular_movies)
popular_movies_rating = popular_movies.find_all('h4',{'class':'rating'})
popular_movies_torrents = popular_movies.find_all('a',{'rel':'nofollow'})

tempList = popular_movies.find_all('div',{'class':'browse-movie-tags'})
tempList_torrents =[]

for i in tempList:
    torrents = i.find_all('a',{'rel':'nofollow'})
    if (len(torrents) == 2):
        tempList_torrents.append({'720p':(re.search('(.+)',torrents[0]['href']).group()),'1080p':(re.search('(.+)',torrents[1]['href']).group())})
    else :
        tempList_torrents.append({'720p':(re.search('(.+)',torrents[0]['href']).group()),'1080p':''})

popular_movies = popular_movies.find_all('a',{'class':'browse-movie-title'})

##print(popular_movies)
##print('\n')
Example #29
        sql = "INSERT INTO scrapydb.cleantriple(subject, subject_alternative, predicate, object,repository) VALUES(%s,%s,%s,%s,%s)"
        values = (s, sa, p, o, repository)
        myac.execute(sql, values)
        mydb.commit()


#------------------- TO get Row from DB ---------------------
source = 'Skill Commons'

myac.execute(
    'select * from triplesSC where predicate = "hasRaw" and process = 0;')
rows = myac.fetchall()
nRows = len(rows)
for r in rows:
    subjectOer = 'oer'
    metadata_tb = BS(r[3], 'html.parser')

    #title
    titleTB = metadata_tb.find('h1').get_text()
    subjectOer = r[6]
    saveTriple(subjectOer, '', 'title', titleTB, source)

    #authors
    try:
        domAuthors = metadata_tb.find('ul', {
            'class': 'authors'
        }).find_all('li')
        for li in domAuthors:
            author = li.get_text()
            saveTriple(subjectOer, '', 'author', author, source)
    except Exception as e:
Example #30
from bs4 import BeautifulSoup as BS
from pycldf import StructureDataset, Source
import json

# load languages
with open('raw/languages.json') as f:
    langs = json.load(f)

soup = BS(open('raw/42 Chinese dialects.kml').read(), 'xml')

formtable, languagetable, parametertable = [], [], []
for p in soup.findAll('Placemark'):
    name = ' '.join([c.contents[0] for c in p.findAll('name')])
    idx = ' '.join([c.contents[0] for c in p.findAll('description')])
    if name in langs:
        gcode = langs[name]['glottolog']
    else:
        gcode = ''
    coords = p.findAll('coordinates')[0].contents[0].replace('\n', '')
    if len(coords.split(',')) == 3:
        lon, lat, _ = coords.split(',')
        languagetable += [{
            'ID': idx.strip(),
            'Name': name.strip(),
            'Glottocode': gcode,
            'Latitude': lat.strip(),
            'Longitude': lon.strip(),
        }]

with open('raw/Parameters.tsv') as f:
    tmp = f.readlines()