def scrape(url, data_list):
    uClient = uReq(url)
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, "html.parser")
    # Finding each event
    div = page_soup.findAll('div', {"class": "wrapBlock wrapBlockNews"})
    for container in div:
        title = container.h2.text
        # date comes as "(dd/mm/yyyy)" inside an <em>
        date = container.em.text.strip('()')
        dateArr = date.split('/')
        month = dateArr[1]
        # date format change: numeric month -> full month name
        month = datetime.datetime.strptime(month, '%m').strftime('%B')
        date = dateArr[2] + ' ' + month + ' ' + dateArr[0]
        d1 = datetime.datetime(int(dateArr[2]), int(month_string_to_number(month)), int(dateArr[0]))
        d2 = datetime.datetime.now()
        location = 'Whitehall College of Further Education, Drumcondra, Dublin'
        ordinates = getOrdinates(location)
        read_more = container.a['href']
        Category = 'EDUCATION, BUSINESS & TECHNOLOGY'
        img = 'https://whitehallcollege.com/uploads/pages/logo.jpg'
        if d1 > d2:  # only keep events that are still upcoming
            data = EventData()
            data.id = uuid.uuid1().__str__()
            data.title = title
            data.startdate = date
            data.enddate = ''
            data.time = ''
            data.category = Category
            data.price = ''
            data.summary = ''
            data.location = location
            data.address = ordinates[2]
            data.latitude = ordinates[0]
            data.longitude = ordinates[1]
            data.img = img
            data.read_more = read_more
            data_list.append(data)
    print(len(data_list))
    return data_list
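# All of the scrape() functions in this repo lean on the same handful of names
# defined elsewhere in the project: uReq, soup, EventData, month_string_to_number
# and getOrdinates, plus the usual stdlib/requests imports. The block below is
# only a sketch of what those are assumed to look like, so a single scraper can
# be exercised standalone; the real project's implementations (in particular
# getOrdinates, which presumably geocodes) may differ. Note that some modules
# call datetime.datetime(...) while others call datetime(...) directly, so the
# per-module import is either "import datetime" or "from datetime import datetime".

import datetime
import json
import re
import uuid

import requests
from urllib.request import urlopen as uReq      # assumed alias
from bs4 import BeautifulSoup as soup            # assumed alias


class EventData:
    # Plain container for one scraped event; the field list is inferred from
    # the attributes the scrapers set on it.
    def __init__(self):
        self.id = self.title = self.startdate = self.enddate = ''
        self.time = self.category = self.price = self.summary = ''
        self.location = self.address = self.img = self.read_more = ''
        self.latitude = 0.0
        self.longitude = 0.0


def month_string_to_number(month):
    # Assumed helper: map 'Jan'/'January' style month names to 1..12.
    return datetime.datetime.strptime(month[:3], '%b').month


def getOrdinates(location):
    # Assumed helper: resolve a free-text location to [latitude, longitude, address].
    # Stubbed with Dublin city-centre coordinates so the scrapers can run offline.
    return [53.3498091, -6.2602548, location]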
def scrape(url, data_list):
    uClient = uReq(url)
    page_html = uClient.read()
    uClient.close()
    page_html = soup(page_html, "lxml")
    # Finding each event
    article = page_html.find_all('div', class_='news_snippet')
    for container in article:
        title = container.h3.text.strip()
        p_tags = container.findAll('p')
        olddate = p_tags[0].text.strip()
        # date formatting: "dd/mm/yyyy ..." -> "dd MonthName yyyy"
        datesplit = olddate.split('/')
        day = datesplit[0]
        month = datesplit[1]
        monthTemp = month
        month = datetime.datetime.strptime(month, '%m').strftime('%B')
        year = datesplit[2].split(' ')[0]
        date = day + ' ' + month + ' ' + year
        d1 = datetime.datetime(int(year), int(monthTemp), int(day))
        summary = p_tags[1].text.strip()
        read_more = 'https://www.itb.ie/NewsEvents/' + container.a['href']
        location = "Technological University Dublin, Blanchardstown, Dublin 15"
        Category = 'EDUCATION, BUSINESS & TECHNOLOGY'
        image = 'https://uindia.net/assets/img/MediaTechnology.jpg'
        if d1 > datetime.datetime.now():
            data = EventData()
            data.id = uuid.uuid1().__str__()
            data.title = title
            data.img = image
            data.startdate = date
            data.enddate = ' '
            data.price = ' '
            data.summary = summary
            data.time = ''
            data.location = location
            data.read_more = read_more
            data.category = Category
            data_list.append(data)
    print(len(data_list))
    return data_list
def scrape(url, data_list):
    uClient = uReq(url)
    page_html = uClient.read()
    uClient.close()
    page_html = soup(page_html, "html.parser")
    # Finding each event
    article = page_html.find_all('div', {"class": "article-list__col col-sm-6 col-md-3"})
    for arti in article:
        read_more = "https://tudublin.ie" + arti.a["href"]
        date = arti.find_all('p')[0].text.strip()
        new_date = date.split(',')[0].split(' ')[0]
        month = date.split(',')[0].split(' ')[1]
        year = date.split(',')[1].strip()
        monthTemp = month
        month = datetime.datetime.strptime(month, '%b').strftime('%B')
        date = new_date + " " + month + " " + year
        d1 = datetime.datetime(int(year), int(month_string_to_number(monthTemp)), int(new_date))
        category = "EDUCATION, BUSINESS & TECHNOLOGY"
        location = arti.find_all('li', {"class": "article-list__location"})[0].text.strip()
        desc = arti.find_all('p')[1].text.strip()
        title = arti.h3.text
        image = "https://tudublin.ie" + arti.img["src"]
        time = arti.find_all('li', {"class": "article-list__time"})[0].text.strip()
        if d1 > datetime.datetime.now():
            data = EventData()
            data.id = uuid.uuid1().__str__()
            data.title = title
            data.img = image
            data.startdate = date
            data.enddate = ' '
            data.price = ' '
            data.summary = desc
            data.time = time
            data.location = location
            data.read_more = read_more
            data.category = category
            data_list.append(data)
    print(len(data_list))
    return data_list
def scrape(urlOriginal, data_list):
    i = 0
    # 3
    for value in range(1, 3):
        url = urlOriginal + format(value)
        print(url)
        try:
            uClient = uReq(url)
        except:
            continue
        page_html = uClient.read()
        uClient.close()
        # Parsing
        page_soup = soup(page_html, "html.parser")
        article_1 = page_soup.findAll('div', class_='search-event-card-wrapper')
        # fetching each event's details
        for container in article_1:
            title = container.findAll('div', class_='eds-event-card__formatted-name--is-clamped')[0].text
            try:
                Date_time = container.findAll('div', class_='eds-text-color--primary-brand eds-l-pad-bot-1 eds-text-weight--heavy eds-text-bs')[0].text
            except:
                Date_time = 'None'
            try:
                Price = container.findAll('div', class_='eds-media-card-content__sub eds-text-bm eds-text-color--grey-600 eds-l-mar-top-1 eds-media-card-content__sub--cropped')[1].text
            except:
                Price = 'None'
            a_tags = container.findAll('a')
            try:
                image = a_tags[0].img['src']
            except:
                image = 'None'
            read_more = a_tags[0]['href']
            print(read_more)
            category = 'COMMUNITY & FESTIVALS'
            if category == 'COMMUNITY & FESTIVALS' and image == 'None':
                image = 'https://www.totallydublin.ie/wp-content/uploads/2017/07/Just-Eat-Street.jpg'
            # description: open the event's own page
            descurl = read_more
            try:
                uClient = uReq(descurl)
            except:
                continue
            desc_html = uClient.read()
            uClient.close()
            # Parsing the detail page
            desc_soup = soup(desc_html, "html.parser")
            desc = (desc_soup.findAll('div', class_='js-xd-read-more-contents l-mar-top-3')
                    or desc_soup.findAll('div', class_='structured-content-rich-text structured-content__module l-align-left l-mar-vert-6 l-sm-mar-vert-4 text-body-medium'))
            if len(desc) > 0:
                try:
                    p_tags = desc[0].findAll('p')
                except:
                    continue
                descrip = []
                for p_tag in p_tags:
                    descrip.append(p_tag.text)
                description = ''.join(str(e) for e in descrip)
            else:
                description = 'None'
            # Date fetching and formatting
            Final_Date = 'None'
            time = desc_soup.findAll('time', class_='clrfix')
            if len(time) > 0:
                time_tags = time[0].findAll('p')
                date_check = time_tags[0].text
                if date_check == 'Multiple Dates' or date_check == 'Multiple Dates GMT' or date_check == 'Multiple Dates IST':
                    Final_Date = date_check
                else:
                    Date_time = date_check.split(',')
                    if len(Date_time) == 2:
                        Final_Date = Date_time[1].strip(' ')
                    else:
                        Mon_Date = Date_time[1].split(' ')
                        if len(Mon_Date) == 3:
                            Date = Mon_Date[2]
                            month = Mon_Date[1]
                            if len(month) <= 3:
                                Month = datetime.datetime.strptime(month, '%b').strftime('%B')
                            else:
                                Month = month
                            year = Date_time[2]
                            Final_Date = Date + ' ' + Month + year
                        elif len(Mon_Date) == 4:
                            Date = Mon_Date[1]
                            month = Mon_Date[2]
                            Month = datetime.datetime.strptime(month, '%b').strftime('%B')
                            year = Mon_Date[3]
                            Final_Date = Date + ' ' + Month + ' ' + year
            # location fetching
            location_div = desc_soup.findAll('div', class_='event-details__data')
            if len(location_div) > 0:
                location_tags = location_div[1].findAll('p')
                locat = location_tags[0].text
                location = locat + ' ' + "Dublin"
            else:
                location = 'Dublin'
            print(location)
            try:
                if location == 'Dublin':
                    ordinates = [53.3498091, -6.2602548, "The Spire,North City,Dublin"]
                else:
                    ordinates = getOrdinates(location)
            except:
                continue
            try:
                d1 = datetime.datetime(int(year), int(month_string_to_number(Month)), int(Date))
            except:
                continue
            d2 = datetime.datetime.now()
            if d1 > d2:
                data = EventData()
                data.id = uuid.uuid1().__str__()
                data.title = title
                data.time = ''
                data.location = location
                data.summary = description
                data.img = image
                data.category = category
                data.address = ordinates[2]
                data.startdate = Final_Date
                data.read_more = read_more
                data.enddate = ''
                data.price = Price
                data.latitude = ordinates[0]
                data.longitude = ordinates[1]
                data_list.append(data)
                i = i + 1
    print(len(data_list))
    return data_list
def scrape(url, data_list):
    uClient = uReq(url)
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, "html.parser")
    # Finding each event
    div = page_soup.findAll('div', {"class": "grid-item"})
    for container in div:
        try:
            url = container.a['href']
        except:
            url = ''
        try:
            image = container.a.img['src']
        except:
            image = 'None'
        title = container.h3.text
        date = container.h2.text
        # date formatting
        datesplit = date.split('/')
        date_month_year = datesplit[0].split('.')
        date = date_month_year[0].split(' ')
        date = date[1]
        dateTemp = date
        month = date_month_year[1]
        monthTemp = month
        month = datetime.datetime.strptime(month, '%m').strftime('%B')
        year = date_month_year[2]
        date = date + ' ' + month + ' ' + year
        d1 = datetime.datetime(int(year), int(monthTemp), int(dateTemp))
        d2 = datetime.datetime.now()
        div_tags = container.findAll('div')
        time = datesplit[1]
        price = div_tags[1].p.text
        location = "Sugar Club, Leeson Street, Dublin"
        ordinates = getOrdinates(location)
        category = "MUSIC & ENTERTAINMENT"
        if category == 'MUSIC & ENTERTAINMENT' and image == 'None':
            image = 'https://livestyle.com/wp-content/uploads/2017/07/slider-4.jpg'
        if d1 > d2:
            data = EventData()
            data.id = uuid.uuid1().__str__()
            data.title = title
            data.time = time
            data.location = location
            data.summary = ''
            data.img = image
            data.category = category
            data.startdate = date
            data.read_more = url
            data.address = ordinates[2]
            data.latitude = ordinates[0]
            data.longitude = ordinates[1]
            data.enddate = ''
            data.price = price
            data_list.append(data)
    print(len(data_list))
    return data_list
def scrape(url, data_list):
    uClient = uReq(url)
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, "html.parser")
    # Finding each event
    article = page_soup.find_all('article', class_="event card")
    for container in article:
        div = container.find_all('div', class_="text")
        v = div[0].a["href"]
        # the image URL is embedded in the card's inline style attribute
        img = container.div["style"]
        pattern = r"(?<=\(')[^'\)]*"
        img = re.search(pattern, img)
        img = img[0]
        # map the site's category labels onto the app's category names
        category = container["data-categories"]
        if category == 'ART & THEATRE':
            category = 'FASHION, ART & THEATRE'
        elif category == 'BUSINESS & TECH':
            category = 'EDUCATION, BUSINESS & TECHNOLOGY'
        elif category == 'FAMILY FRIENDLY':
            category = 'COMMUNITY & FESTIVALS'
        elif category == 'FESTIVALS':
            category = 'COMMUNITY & FESTIVALS'
        elif category == 'FILM & LITERATURE':
            category = 'FASHION, ART & THEATRE'
        elif category == 'FOOD & DRINK':
            category = 'FOOD & DRINK'
        elif category == 'FREE':
            category = 'FREE'
        elif category == 'LEARNING':
            category = 'EDUCATION, BUSINESS & TECHNOLOGY'
        elif category == 'MUSIC & COMEDY':
            category = 'MUSIC & ENTERTAINMENT'
        elif category == 'SPORTS':
            category = 'SPORTS & HEALTH'
        # open the event's detail page for date/time/price
        url2 = v
        uClient = uReq(url2)
        page_html = uClient.read()
        uClient.close()
        page_soup = soup(page_html, "html.parser")
        div = page_soup.find_all('div', class_="event-info")
        details = div[0].findAll('dd')
        datadict = {}
        var1 = div[0].findAll('dt')
        for i in range(len(var1)):
            datadict[var1[i].text] = details[i].text
        date = datadict['Date:']
        time = datadict['Time:']
        if 'Price:' in datadict:
            price = datadict['Price:']
        else:
            continue
        var2 = date.split('-')
        startdate = var2[0]
        startdate = startdate.split(' ')
        start = startdate[1].replace('th', '').replace('nd', '').replace('st', '').replace('rd', '')
        date = start
        month = startdate[2]
        year = datetime.now().year
        startdate = date + ' ' + month + ' ' + year.__str__()
        d1 = datetime(year, int(month_string_to_number(month)), int(date))
        try:
            enddate = var2[1]
            enddateArr = enddate.split(' ')
            end = enddateArr[1].replace('th', '').replace('nd', '').replace('st', '').replace('rd', '')
            d1 = datetime(year, int(month_string_to_number(enddateArr[2])), int(end))
        except:
            enddate = 'None'
        title = page_soup.h1.text
        p_tags = container.findAll('p')
        location = p_tags[0].text
        summary = p_tags[1].text.strip()
        read_more = url2
        if d1 > datetime.now():
            data = EventData()
            data.id = uuid.uuid1().__str__()
            data.title = title
            data.time = time
            data.location = location
            data.summary = summary
            data.img = img
            data.category = category
            data.startdate = startdate
            data.read_more = read_more
            data.enddate = enddate
            data.price = price
            data_list.append(data)
    print(len(data_list))
    return data_list
def scrape(urlOriginal, data_list):
    data_list = []
    # 136 #100--added
    for value in range(1, 136):
        url = urlOriginal + format(value) + '/'
        print(url)
        try:
            uClient = uReq(url)
        except:
            continue
        page_html = uClient.read()
        uClient.close()
        # Finding each event
        page_soup = soup(page_html, "html.parser")
        det = page_soup.findAll('div', class_='details')
        for container in det:
            try:
                title = container.h2.text
            except:
                title = 'None'
            location = container.h3.text
            try:
                description = container.p.text.strip('\n')
                description = description.strip(' ')
            except:
                description = 'None'
            try:
                img = container.img['src']
            except:
                img = 'None'
            read_more = container.a['href']
            read = 'https://www.ireland.com/' + read_more
            category = 'TOURISM & SIGHTSEEING'
            if category == 'TOURISM & SIGHTSEEING' and img == 'None':
                img = 'https://www.fhi.no/globalassets/bilder/vaksine/oversikt-reisevaksine.jpg?preset=mainbodywidth'
            print(location)
            if location == 'Dublin':
                ordinates = [53.3498091, -6.2602548, "The Spire,North City,Dublin"]
            else:
                ordinates = getOrdinates(location)
            data = EventData()
            data.id = uuid.uuid1().__str__()
            data.title = title
            data.time = ''
            data.location = location
            data.summary = description
            data.img = img
            data.category = category
            data.startdate = ''
            data.read_more = read
            data.address = ordinates[2]
            data.latitude = ordinates[0]
            data.longitude = ordinates[1]
            data.enddate = ''
            data.price = ''
            data_list.append(data)
    print(len(data_list))
    return data_list
def scrape(urlOriginal, data_list):
    # 4
    for value in range(1, 4):
        url = urlOriginal + format(value)
        print(url)
        uClient = uReq(url)
        page_html = uClient.read()
        uClient.close()
        page_soup = soup(page_html, "html.parser")
        div = page_soup.find_all('li', {"class": "wrapped wrapped-borderless"})
        for container in div:
            Title = container.h3.text.strip()
            URL = 'https://www.rcsi.com' + container.a['href']
            date = container.div.text.replace("\n", " ")
            # date formatting; a "5-7 Mar" style range also carries an end day
            date_split = date.split(' ')
            date = date_split[1]
            enddate1 = None
            if "-" in date:
                datecheck = date.split('-')
                date = datecheck[0]
                enddate1 = datecheck[1]
            month = date_split[2]
            month = datetime.datetime.strptime(month, '%b').strftime('%B')
            year = datetime.datetime.now().year
            date1 = date + ' ' + month + ' ' + year.__str__()
            d1 = datetime.datetime(int(year), int(month_string_to_number(month)), int(date))
            if enddate1:
                enddate = enddate1 + ' ' + month + ' ' + year.__str__()
                d1 = datetime.datetime(int(year), int(month_string_to_number(month)), int(enddate1))
            else:
                enddate = 'None'
            d2 = datetime.datetime.now()
            Location = container.p.text.strip('\n')
            try:
                if Location == 'Dublin':
                    ordinates = [53.3498091, -6.2602548, "The Spire,North City,Dublin"]
                else:
                    ordinates = getOrdinates(Location)
            except:
                continue
            Category = 'EDUCATION, BUSINESS & TECHNOLOGY'
            img = 'http://www.hrbcentreprimarycare.ie/images/rcsilogonewer.png'
            if d1 > d2:
                data = EventData()
                data.id = uuid.uuid1().__str__()
                data.title = Title
                data.time = ''
                data.location = Location
                data.summary = ''
                data.img = img
                data.category = Category
                data.startdate = date1
                data.read_more = URL
                data.address = ordinates[2]
                data.latitude = ordinates[0]
                data.longitude = ordinates[1]
                data.enddate = enddate
                data.price = ''
                data_list.append(data)
    print(len(data_list))
    return data_list
def scrape(url, data_list):
    uClient = uReq(url)
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, "html.parser")
    article = page_soup.find_all('div', {"class": "c-feed-box-outer"})
    for arti in article:
        url2 = 'https://portal.dublinchamberhosting.com' + arti.a["href"]
        # Opening connection to the event's own page, grabbing the page
        uClient = uReq(url2)
        page_html = uClient.read()
        uClient.close()
        # Parsing the detail page
        page_soup = soup(page_html, "html.parser")
        try:
            image = page_soup.find_all('div', {"class": "c-banner_background-image o-cropcontent o-crop_content--center"})[0].img["src"]
        except:
            image = 'https://uindia.net/assets/img/MediaTechnology.jpg'
        desc_tag = page_soup.find_all('div', {"class": "description"})[0].p
        try:
            desc = desc_tag.span.text.strip()
        except:
            desc = desc_tag.text.strip()
        event_box = page_soup.find_all('div', {"class": "c-event-booking__right-column"})[0]
        event_box = event_box.find_all('div', {"class": "c-event_event-info"})[0].find_all('div', {"class": "c-event-info_value"})
        title = event_box[0].text.strip()
        date = event_box[1].text.strip()
        # Date formatting: "Day dd Mon yyyy" -> "dd MonthName yyyy"
        newdate = date.split(' ')
        day = newdate[1]
        month = newdate[2]
        month = datetime.datetime.strptime(month, '%b').strftime('%B')
        year = newdate[3]
        date = day + ' ' + month + ' ' + year
        time = event_box[2].text.strip()
        address = event_box[3].text.strip()
        category = "EDUCATION, BUSINESS & TECHNOLOGY"
        d1 = datetime.datetime(int(year), int(month_string_to_number(month)), int(day))
        d2 = datetime.datetime.now()
        if d1 > d2:
            data = EventData()
            data.id = uuid.uuid1().__str__()
            data.title = title
            data.time = time
            data.location = address
            data.summary = desc
            data.img = image
            data.category = category
            data.startdate = date
            data.read_more = url2
            data.enddate = ''
            data.price = ''
            data_list.append(data)
    print(len(data_list))
    return data_list
def scrape(urlOriginal, data_list):
    i = 0
    for value in range(1, 67):
        url = urlOriginal + format(value)
        print(url)
        uh = uReq(url)
        raw = uh.read()
        uh.close()
        print('Retrieved', len(raw), 'characters')
        JSONContent = json.loads(raw.decode("utf-8"))
        content = json.dumps(JSONContent, indent=4, sort_keys=True)
        data = json.loads(content)
        # replacing category_id with its corresponding category name:
        # scrape the category id/name pairs from the API
        url1 = "https://www.eventbriteapi.com/v3/categories/?token=4KFS7BDPSZ5A5KWQ62KZ"
        catJSONContent = requests.get(url1).json()
        catcontent = json.dumps(catJSONContent, indent=4, sort_keys=True)
        categorylist = json.loads(catcontent)
        category_list = []
        for categories in categorylist['categories']:
            name = categories['name']
            id_cat = categories['id']
            category_list.append((name, id_cat))
        for events in data['events']:
            name = events['name']['text']
            description = events['description']['text']
            description = format(description)
            link = events['url']
            start_datetime = events['start']['local']
            # formatting start date and time
            start_date_split = start_datetime.split('T')
            start_date = start_date_split[0]
            start_time = start_date_split[1]
            start_date = start_date.split('-')
            date = start_date[2]
            month = start_date[1]
            year = start_date[0]
            month = datetime.datetime.strptime(month, '%m').strftime('%B')
            start_date = date + ' ' + month + ' ' + year
            # formatting end date and time
            end_datetime = events['end']['local']
            end_date_split = end_datetime.split('T')
            end_date = end_date_split[0]
            end_time = end_date_split[1]
            time = start_time + '-' + end_time
            end_date = end_date.split('-')
            date = end_date[2]
            month = end_date[1]
            monthTemp = end_date[1]
            year = end_date[0]
            month = datetime.datetime.strptime(month, '%m').strftime('%B')
            end_date = date + ' ' + month + ' ' + year
            d1 = datetime.datetime(int(year), int(monthTemp), int(date))
            # event price
            free_event = events['is_free']
            if free_event == True:
                price = 'free'
            else:
                price = 'check link for more details'
            category_id = events['category_id']
            # replacing category_id with the category name
            for each in category_list:
                if category_id == each[1]:
                    category = each[0]
            # Category uniformisation
            if category == 'Auto, Boat & Air' or category == 'Health & Wellness' or category == 'Sports & Fitness':
                category = 'HEALTH & SPORTS'
            elif category == 'Business & Professional' or category == 'Science & Technology' or category == 'School Activities' or category == 'Government & Politics':
                category = 'EDUCATION, BUSINESS & TECHNOLOGY'
            elif category == 'Charity & Causes' or category == 'Community & Culture' or category == 'Family & Education' or category == 'Home & Lifestyle' or category == 'Religion & Spirituality':
                category = 'COMMUNITY & FESTIVALS'
            elif category == 'Fashion & Beauty' or category == 'Film, Media & Entertainment' or category == 'Performing & Visual Arts':
                category = 'FASHION, ART & THEATRE'
            elif category == 'Food & Drink':
                category = 'FOOD & DRINK'
            elif category == 'FREE':
                category = 'FREE'
            elif category == 'Music' or category == 'Hobbies & Special Interest':
                category = 'MUSIC & ENTERTAINMENT'
            elif category == 'Travel & Outdoor' or category == 'Seasonal & Holiday':
                category = 'TOURISM & SIGHTSEEING'
            elif category == 'Other':
                category = 'OTHERS'
            try:
                img = events['logo']['original']['url']
            except:
                img = 'none'
            Location = events['venue']['address']['localized_multi_line_address_display']
            # location formatting
            location = str(Location).strip('[]')
            location = location.split(',')
            try:
                location[0] = location[0].strip("''")
            except:
                pass
            try:
                location[1] = location[1].strip(" ''")
            except:
                pass
            try:
                location = location[0] + ' ' + location[1]
            except:
                location = location[0]
            if d1 > datetime.datetime.now():
                data = EventData()
                data.id = uuid.uuid1().__str__()
                data.title = name
                data.time = time
                data.location = location
                data.summary = description
                data.img = img
                data.category = category
                data.startdate = start_date
                data.read_more = link
                data.enddate = end_date
                data.price = price
                data_list.append(data)
    print(len(data_list))
    return data_list
def scrape(url, data_list):
    for value in range(1, 3):
        url = "https://www.knowledgetransferireland.com/Events/Upcoming-Events/?pageNumber={}".format(value)
        print(url)
        try:
            uClient = uReq(url)
        except:
            continue
        page_html = uClient.read()
        uClient.close()
        page_soup = soup(page_html, "html.parser")
        div = page_soup.findAll('div', {"class": "each-item"})
        for container in div:
            span_tags = container.div.findAll('span')
            p_tags = container.findAll('p')
            date1 = span_tags[0].text
            date2 = span_tags[1].text
            month = date2
            date2 = datetime.datetime.strptime(date2, '%b').strftime('%B')
            date3 = span_tags[2].text
            startdate = date1 + ' ' + date2 + ' ' + date3
            d1 = datetime.datetime(int(date3), int(month_string_to_number(month)), int(date1))
            category = 'EDUCATION, BUSINESS & TECHNOLOGY'
            try:
                title = container.h2.text
            except:
                title = 'none'
            read_more = container.h2.a['href']
            read_more = 'https://www.knowledgetransferireland.com/' + read_more
            try:
                description = container.p.text
            except:
                description = 'none'
            try:
                location = p_tags[2].text
            except:
                location = 'Dublin'
            location = location + ' ' + "Dublin"
            ordinates = getOrdinates(location)
            img = 'https://uindia.net/assets/img/MediaTechnology.jpg'
            if d1 > datetime.datetime.now():
                data = EventData()
                data.id = uuid.uuid1().__str__()
                data.title = title
                data.time = ''
                data.location = location
                data.summary = description
                data.img = img
                data.category = category
                data.startdate = startdate
                data.read_more = read_more
                data.address = ordinates[2]
                data.latitude = ordinates[0]
                data.longitude = ordinates[1]
                data.enddate = ''
                data.price = ''
                data_list.append(data)
    return data_list
def scrape(urlOriginal, data_list):
    # 49 #27
    for value in range(1, 27):
        url = urlOriginal + format(value)
        print(url)
        try:
            JSONContent = requests.get(url).json()
        except:
            continue
        content = json.dumps(JSONContent, indent=4, sort_keys=True)
        data = json.loads(content)
        var1 = data['_embedded']
        for var1 in var1['events']:
            Title = var1["name"]
            URL = var1["url"]
            date1 = var1['dates']['start']['localDate']
            date1 = date1.split('-')
            month = date1[1]
            date2 = datetime.datetime.strptime(month, '%m').strftime('%B')
            date = date1[2] + ' ' + date2 + ' ' + date1[0]
            d1 = datetime.datetime(int(date1[0]), int(date1[1]), int(date1[2]))
            d2 = datetime.datetime.now()
            try:
                Time = var1['dates']['start']['localTime']
            except:
                Time = ''
            try:
                Address_Line_1 = var1['_embedded']['venues'][0]['address']['line1']
            except:
                Address_Line_1 = ''
            try:
                Address_Line_2 = var1['_embedded']['venues'][0]['address']['line2']
            except:
                Address_Line_2 = ''
            try:
                Postal_Code = var1['_embedded']['venues'][0]['postalCode']
            except:
                Postal_Code = ''
            img = var1['images'][2]['url']
            category = var1['classifications'][0]['segment']['name']
            if category == 'Arts & Theatre' or category == '':
                category = 'FASHION, ART & THEATRE'
            elif category == 'Sport' or category == 'Sports':
                category = 'SPORTS & HEALTH'
            elif category == 'Family & Attractions':
                category = 'COMMUNITY & FESTIVALS'
            elif category == 'Music':
                category = 'MUSIC & ENTERTAINMENT'
            else:
                category = 'OTHERS'
            Venue = var1['_embedded']['venues'][0]['name']
            Location = Venue + ' ' + "Dublin"
            try:
                if Location == 'Dublin':
                    ordinates = [53.3498091, -6.2602548, "The Spire,North City,Dublin"]
                else:
                    ordinates = getOrdinates(Location)
            except:
                continue
            if d1 > d2:
                data = EventData()
                data.id = uuid.uuid1().__str__()
                data.title = Title
                data.time = Time
                data.location = Location
                data.summary = ''
                data.img = img
                data.category = category
                data.startdate = date
                data.read_more = URL
                data.address = ordinates[2]
                data.latitude = ordinates[0]
                data.longitude = ordinates[1]
                data.enddate = ''
                data.price = ''
                data_list.append(data)
    print(len(data_list))
    return data_list
def scrape(url, data_list):
    uClient = uReq(url)
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, "html.parser")
    # Finding each event
    div = page_soup.findAll('article', {"class": "col-sm-1-3 col-md-1-3 col-lg-4 item item-event"})
    for container in div:
        title = container.h2.text
        a_tags = container.findAll('a')
        image = 'https://www.poetryireland.ie' + a_tags[0].img['src']
        read_more = a_tags[0]['href']
        print(read_more)
        div_tags = container.findAll('div')
        date = div_tags[2].text.strip('\n\t')
        time_tag = date.split(',')
        time = time_tag[1]
        date = time_tag[0]
        # date formatting
        newdate = date.split(' ')
        date = newdate[1]
        month = newdate[2]
        monthTemp = month
        month = datetime.datetime.strptime(month, '%b').strftime('%B')
        year = datetime.datetime.now().year
        date = date + ' ' + month + ' ' + year.__str__()
        d1 = datetime.datetime(int(year), int(month_string_to_number(monthTemp)), int(newdate[1]))
        location = div_tags[3].text
        location = location + ',' + "Dublin"
        ordinates = getOrdinates(location)
        if str(ordinates) == 'Dublin':
            ordinates = getOrdinates("Dublin")
        read_more = a_tags[1]['href']
        Category = 'FASHION, ART & THEATRE'
        if d1 > datetime.datetime.now():
            data = EventData()
            data.id = uuid.uuid1().__str__()
            data.title = title
            data.time = time
            data.location = location
            data.summary = ''
            data.img = image
            data.category = Category
            data.startdate = date
            data.read_more = read_more
            data.address = ordinates[2]
            data.latitude = ordinates[0]
            data.longitude = ordinates[1]
            data.enddate = ''
            data.price = ''
            data_list.append(data)
    print(len(data_list))
    return data_list
def scrape(urlOriginal, data_list):
    for value in range(1, 5):
        url = urlOriginal + format(value) + '/'
        print(url)
        uClient = uReq(url)
        page_html = uClient.read()
        uClient.close()
        # Finding each event
        page_soup = soup(page_html, "html.parser")
        div = page_soup.findAll('article', {"class": "article whatsonarticle"})
        for container in div:
            Title = container.h3.text.strip()
            try:
                image = container.a.img['src']
            except:
                image = 'None'
            # map the site's own category strings onto the app's categories,
            # substituting a stock image when the article has none
            category = container.h5.text.strip()
            if category == 'Activities, Fashion' or category == 'Activities, Art, Workshop' or category == 'Art, Exhibition' or category == 'Activities, Christmas, Family, Theatre' or category == 'Activities, Theatre':
                category = 'FASHION, ART & THEATRE'
                if image == 'None':
                    image = 'https://4.bp.blogspot.com/-haQkpIywgPA/W5L1p-6P5JI/AAAAAAAANv4/279R0n1im_MugfsnYTlbf5ZiTaG2s7NYQCLcBGAs/s1600/Six_photoby_IdilSukan_18.jpg'
            elif category == 'Activities, Drinks, Family, Food And Drink,Nightlife,Talks, Workshop' or category == 'Food And Drink' or category == 'Drinks, Food And Drink, History, Tours' or category == 'Drinks, History, Tours' or category == 'Drinks, Food And Drink' or category == 'Drinks' or category == 'Activities, Drinks, Family, Food And Drink, Nightlife, Talks, Workshop' or category == 'Culture, Food And Drink':
                category = 'FOOD & DRINK'
                if image == 'None':
                    image = 'https://anandipaliwal.files.wordpress.com/2015/06/food-table-relisted.jpg'
            elif category == 'Activities, Culture, Exhibition':
                category = 'COMMUNITY & FESTIVALS'
                if image == 'None':
                    image = 'https://www.totallydublin.ie/wp-content/uploads/2017/07/Just-Eat-Street.jpg'
            elif category == 'Tours' or category == 'Music, Tours' or category == 'Culture, Tours' or category == 'Music, Nightlife' or category == 'Music' or category == 'Activities, Art, Culture, Drinks, Fashion, Food And Drink, Free, Market' or category == 'Activities, Comedy' or category == 'Activities, Culture, Exhibition' or category == 'Activities' or category == 'Christmas, Music' or category == 'Activities, Christmas, Cinema, Music' or category == 'Activities, Family':
                category = 'MUSIC & ENTERTAINMENT'
                if image == 'None':
                    image = 'https://livestyle.com/wp-content/uploads/2017/07/slider-4.jpg'
            elif category == 'Theatre' or category == 'Fashion' or category == 'Culture, Theatre' or category == 'Halloween, Theatre' or category == 'Activities, Fashion' or category == 'Culture, Exhibition, Family, Food And Drink' or category == 'Activities, Art, Workshop' or category == 'Family, Theatre' or category == 'Activities, Halloween, Nightlife, Tours' or category == 'Beauty, Christmas':
                category = 'FASHION, ART & THEATRE'
                if image == 'None':
                    image = 'https://4.bp.blogspot.com/-haQkpIywgPA/W5L1p-6P5JI/AAAAAAAANv4/279R0n1im_MugfsnYTlbf5ZiTaG2s7NYQCLcBGAs/s1600/Six_photoby_IdilSukan_18.jpg'
            elif category == '':
                category = 'OTHERS'
                if image == 'None':
                    image = 'https://discuss.fm/images/max_topic_images/others.jpg'
            elif category == 'Beauty, Fitness':
                category = 'SPORTS & HEALTH'
                if image == 'None':
                    image = 'https://previews.123rf.com/images/tnn103eda/tnn103eda1705/tnn103eda170500019/79377445-huge-multi-sports-collage-soccer-basketball-football-hockey-baseball-boxing-etc.jpg'
            else:
                category = 'OTHERS'
                if image == 'None':
                    image = 'https://discuss.fm/images/max_topic_images/others.jpg'
            URL = container.a['href']
            date = container.cite.text.strip('\n\t\t')
            split_date = date.split('-')
            start_date = split_date[0]
            # date formatting for start_date
            format_date = start_date.split(' ')
            date = format_date[0]
            month = format_date[1]
            year = format_date[2]
            monthfull = datetime.datetime.strptime(month, '%b').strftime('%B')
            start_date = date + ' ' + monthfull + ' ' + year
            d1 = datetime.datetime(int(year), int(month_string_to_number(monthfull)), int(date))
            start_date = start_date.strip('\t\t')
            try:
                end_date = split_date[1]
            except:
                end_date = 'None'
            # date formatting for end_date
            if end_date != 'None':
                format_date = end_date.split(' ')
                date = format_date[1]
                month = format_date[2]
                year = format_date[3]
                monthfull = datetime.datetime.strptime(month, '%b').strftime('%B')
                end_date = date + ' ' + monthfull + ' ' + year
                d1 = datetime.datetime(int(year), int(month_string_to_number(monthfull)), int(date))
            a_tags = container.div.findAll('a')
            location = a_tags[2].text
            location = location.split('|')[0]
            if location == 'The Grafton Quarter' or location == 'The Grafton Quarter Dublin':
                location = 'The Grafton street'
            elif location == 'Dublin One':
                location = 'Parnell street'
            location = location + ' ' + "Dublin"
            ordinates = getOrdinates(location)
            if str(ordinates) == 'None':
                ordinates = getOrdinates("Dublin")
            if d1 > datetime.datetime.now():
                data = EventData()
                data.id = uuid.uuid1().__str__()
                data.title = Title
                data.time = ''
                data.location = location
                data.summary = ''
                data.img = image
                data.category = category
                data.startdate = start_date
                data.read_more = URL
                data.address = ordinates[2]
                data.latitude = ordinates[0]
                data.longitude = ordinates[1]
                data.enddate = end_date
                data.price = ''
                data_list.append(data)
    print(len(data_list))
    return data_list
def scrape(urlOriginal, data_list):
    for value in range(1, 11):
        url = urlOriginal + format(value) + '/'
        print(url)
        uClient = uReq(url)
        page_html = uClient.read()
        uClient.close()
        # Finding each event
        page_soup = soup(page_html, "html.parser")
        article = page_soup.findAll('li', class_='sfnewsListItem sflistitem')
        item = page_soup.findAll('li', class_='sfrelatedListItem sflistitem')
        readmore = page_soup.findAll('div', class_='NewsEvent_right')
        for container in article:
            title = container.a.text
            div_tags = container.findAll('div')
            date = div_tags[0].text.strip()
            # date formatting: "Weekday, Month dd, yyyy" -> "dd Month yyyy"
            new = date.split(',')
            year = new[2].strip()
            new1 = new[1].split(' ')
            Date = new1[2]
            month = new1[1]
            fulldate = Date + ' ' + month + ' ' + year
            summary = div_tags[1].text.strip()
            location = "Dublin Business School, Dublin"
            category = "EDUCATION, BUSINESS & TECHNOLOGY"
            img = item[0].a.img['src']
            read_more = readmore[0].a['href']
            read_more = 'https://www.dbs.ie/about-dbs/news-and-events/' + read_more
            monthInt = month_string_to_number(month)
            d1 = datetime(int(year), monthInt, int(Date))
            d2 = datetime.now()
            try:
                ordinates = getOrdinates(location)
            except:
                continue
            if d1 > d2:
                data = EventData()
                data.id = uuid.uuid1().__str__()
                data.title = title
                data.startdate = fulldate
                data.enddate = ''
                data.time = ''
                data.category = category
                data.price = ''
                data.summary = summary
                data.address = ordinates[2]
                data.location = location
                data.img = img
                data.latitude = ordinates[0]
                data.longitude = ordinates[1]
                data.read_more = read_more
                data_list.append(data)
    print(len(data_list))
    return data_list
def scrape(url, data_list):
    uClient = uReq(url)
    page_html = uClient.read()
    uClient.close()
    # Finding each event
    page_soup = soup(page_html, "html.parser")
    events_today = page_soup.find_all('div', class_="calendarEventsBlockWrap")
    events = events_today[0].find_all('div', class_="event-item clearfix")
    for event in events:
        title = event.h3.a["title"]
        read_more = event.find_all('a', {"class": "calendarImageLink"})[0]["data-link-url"]
        try:
            image = event.find_all('a', {"class": "calendarImageLink"})[0]['href']
        except:
            image = 'None'
        if not (image.endswith(".jpg") or image.endswith(".png")):
            image = 'None'
        # the <em> holds "start date through end date"
        date = event.em.text
        start_date = date.split('through')[0].strip()
        try:
            Year1 = start_date.split(',')[1].strip()
        except:
            Year1 = 'None'
        Date1 = start_date.split(',')[0].split(' ')[1]
        Month1 = start_date.split(',')[0].split(' ')[2]
        start_date = Date1 + " " + Month1 + " " + Year1
        end_date = date.split('through')[1].strip()
        Year2 = end_date.split(',')[1].strip()
        Date2 = end_date.split(',')[0].split(' ')[1]
        Month2 = end_date.split(',')[0].split(' ')[2]
        end_date = Date2 + " " + Month2 + " " + Year2
        print(end_date)
        d1 = datetime(int(Year2.split('.')[0]), int(month_string_to_number(Month2)), int(Date2))
        d2 = datetime.now()
        # description
        desc = event.div.text.strip()
        # Time, location, price, address: each <strong> label is followed by its value
        header = [elem.next for elem in event.find_all('strong')]
        header = header[1:]
        values = [elem.next.next for elem in event.find_all('strong')]
        values = values[1:]
        j = 0
        my_dict = {}
        for i in header:
            my_dict[i] = values[j].strip()
            j = j + 1
        category = my_dict["Category:"]
        if category == 'Arts / Exhibits' or category == 'Comedy' or category == 'Theatre / Dance':
            category = 'FASHION, ART & THEATRE'
            if image == 'None':
                image = 'https://4.bp.blogspot.com/-haQkpIywgPA/W5L1p-6P5JI/AAAAAAAANv4/279R0n1im_MugfsnYTlbf5ZiTaG2s7NYQCLcBGAs/s1600/Six_photoby_IdilSukan_18.jpg'
        elif category == 'Business Networking' or category == 'Canvention / Conference' or category == 'Educational' or category == 'Expo':
            category = 'EDUCATION, BUSINESS & TECHNOLOGY'
            if image == 'None':
                image = 'https://uindia.net/assets/img/MediaTechnology.jpg'
        elif category == 'Concert / Live Music' or category == 'Cultutal' or category == 'Entertainment':
            category = 'MUSIC & ENTERTAINMENT'
            if image == 'None':
                image = 'https://livestyle.com/wp-content/uploads/2017/07/slider-4.jpg'
        elif category == 'Festival' or category == 'Kids / Family':
            category = 'COMMUNITY & FESTIVALS'
            if image == 'None':
                image = 'https://www.totallydublin.ie/wp-content/uploads/2017/07/Just-Eat-Street.jpg'
        elif category == ' Health' or category == 'Sports':
            category = 'SPORTS & HEALTH'
            if image == 'None':
                image = 'https://previews.123rf.com/images/tnn103eda/tnn103eda1705/tnn103eda170500019/79377445-huge-multi-sports-collage-soccer-basketball-football-hockey-baseball-boxing-etc.jpg'
        if d1 > d2:
            data = EventData()
            data.id = uuid.uuid1().__str__()
            data.title = title
            data.img = image
            data.startdate = start_date
            data.enddate = end_date
            data.summary = desc
            data.time = my_dict["Time:"]
            data.location = my_dict["Location:"] + ' ' + my_dict["Address:"]
            data.read_more = read_more
            try:
                data.price = my_dict["Price:"]
            except:
                data.price = "SEE DESCRIPTION"
            data.category = category
            data_list.append(data)
    print(len(data_list))
    return data_list