def scrape(urlOriginal, data_list):
    """Collect upcoming tourism events from four pages of search results.

    Every result card on listing pages 1-4 is followed to its detail page,
    from which the description, start date and venue are read. An event is
    appended to ``data_list`` only when a single start date could be parsed
    and that date lies in the future; "Multiple Dates" listings and dates
    without a year are therefore skipped.

    Args:
        urlOriginal: Listing URL prefix; the page number (1-4) is appended.
        data_list: List that receives one ``EventData`` per upcoming event
            (mutated in place).

    Returns:
        The same ``data_list`` object that was passed in.
    """
    category = 'TOURISM & SIGHTSEEING'
    placeholder_image = 'https://www.fhi.no/globalassets/bilder/vaksine/oversikt-reisevaksine.jpg?preset=mainbodywidth'
    multi_date_labels = ('Multiple Dates', 'Multiple Dates GMT',
                         'Multiple Dates IST')
    price_class = ('eds-media-card-content__sub eds-text-bm '
                   'eds-text-color--grey-600 eds-l-mar-top-1 '
                   'eds-media-card-content__sub--cropped')
    rich_text_class = ('structured-content-rich-text '
                       'structured-content__module l-align-left '
                       'l-mar-vert-6 l-sm-mar-vert-4 text-body-medium')
    unparsed = ('None', None, None, None)

    def fetch(page_url):
        """Return the body of ``page_url``, or None when it cannot be read."""
        try:
            client = uReq(page_url)
            try:
                return client.read()
            finally:
                client.close()
        except Exception as err:
            # Best effort: one unreachable page must not abort the whole run.
            print('Skipping {}: {}'.format(page_url, err))
            return None

    def full_month(abbreviation):
        """Expand a month abbreviation ('Mar' -> 'March'); None if unknown."""
        try:
            return datetime.datetime.strptime(abbreviation,
                                              '%b').strftime('%B')
        except ValueError:
            return None

    def parse_start(date_text):
        """Return ``(display, day, month_name, year)`` for a detail-page date.

        ``day``, ``month_name`` and ``year`` are None when the text does not
        describe one concrete calendar date.
        """
        if date_text in multi_date_labels:
            return date_text, None, None, None
        pieces = date_text.split(',')
        if len(pieces) == 2:
            return pieces[1].strip(' '), None, None, None
        if len(pieces) < 3:
            return unparsed
        tokens = pieces[1].split(' ')
        if len(tokens) == 3:
            # Month-first layout; the year is the next comma-separated field
            # and is concatenated without a separator, as before.
            day, month_name, year = tokens[2], tokens[1], pieces[2]
            if len(month_name) <= 3:
                month_name = full_month(month_name)
            if month_name is None:
                return unparsed
            return day + ' ' + month_name + year, day, month_name, year
        if len(tokens) == 4:
            # Day-first layout with the year inside the same field.
            day, month_name, year = tokens[1], full_month(tokens[2]), tokens[3]
            if month_name is None:
                return unparsed
            return (day + ' ' + month_name + ' ' + year, day, month_name,
                    year)
        return unparsed

    for value in range(1, 5):
        url = urlOriginal + format(value)
        print(url)
        page_html = fetch(url)
        if page_html is None:
            continue
        page_soup = soup(page_html, "html.parser")
        cards = page_soup.findAll('div', class_='search-event-card-wrapper')

        for container in cards:
            titles = container.findAll(
                'div', class_='eds-event-card__formatted-name--is-clamped')
            a_tags = container.findAll('a')
            if not titles or not a_tags:
                continue  # malformed card: nothing usable to record
            title = titles[0].text

            try:
                Price = container.findAll('div', class_=price_class)[1].text
            except IndexError:
                Price = 'None'

            try:
                image = a_tags[0].img['src']
            except (TypeError, KeyError):
                image = placeholder_image

            try:
                read_more = a_tags[0]['href']
            except KeyError:
                continue
            print(read_more)

            # The detail page carries the description, date and venue.
            desc_html = fetch(read_more)
            if desc_html is None:
                continue
            desc_soup = soup(desc_html, "html.parser")

            desc = desc_soup.findAll(
                'div', class_='js-xd-read-more-contents l-mar-top-3'
            ) or desc_soup.findAll('div', class_=rich_text_class)
            if desc:
                description = ''.join(p.text for p in desc[0].findAll('p'))
            else:
                description = 'None'

            # Date parts are reset for every event so that an unparseable
            # date can never inherit the previous event's values.
            Final_Date, Date, Month, year = unparsed
            time_nodes = desc_soup.findAll('time', class_='clrfix')
            if time_nodes:
                time_tags = time_nodes[0].findAll('p')
                if time_tags:
                    Final_Date, Date, Month, year = parse_start(
                        time_tags[0].text)
            if Date is None:
                continue
            try:
                d1 = datetime.datetime(int(year),
                                       int(month_string_to_number(Month)),
                                       int(Date))
            except Exception:
                # month_string_to_number is defined elsewhere; any failure to
                # build a date means the event cannot be filtered, so skip it.
                continue
            if d1 <= datetime.datetime.now():
                continue

            # The venue is read from the second 'event-details__data' block.
            location = 'Dublin'
            location_div = desc_soup.findAll('div',
                                             class_='event-details__data')
            if len(location_div) > 1:
                location_tags = location_div[1].findAll('p')
                if location_tags:
                    location = location_tags[0].text + ' ' + "Dublin"
            print(location)

            if location == 'Dublin':
                # Fixed fallback: [latitude, longitude, address].
                ordinates = [53.3498091, -6.2602548,
                             "The Spire,North City,Dublin"]
            else:
                try:
                    ordinates = getOrdinates(location)
                except Exception:
                    continue

            data = EventData()
            data.id = str(uuid.uuid1())
            data.title = title
            data.time = ''
            data.location = location
            data.summary = description
            data.img = image
            data.category = category
            data.startdate = Final_Date
            data.read_more = read_more
            data.enddate = ''
            data.price = Price
            data.address = ordinates[2]
            data.latitude = ordinates[0]
            data.longitude = ordinates[1]
            data_list.append(data)

    print(len(data_list))
    return data_list
def scrape(urlOriginal, data_list):
    """Collect upcoming events from three listing pages of rcsi.com.

    Each ``li.wrapped.wrapped-borderless`` item supplies a title, link, a
    day (or ``start-end`` day range) with an abbreviated month, and a
    location. The year is not present on the page, so the current year is
    assumed. An event is kept when its end day (or start day, if there is no
    range) is still in the future.

    Args:
        urlOriginal: Listing URL prefix; the page number (1-3) is appended.
        data_list: List that receives one ``EventData`` per upcoming event
            (mutated in place).

    Returns:
        The same ``data_list`` object that was passed in.
    """
    Category = 'EDUCATION, BUSINESS & TECHNOLOGY'
    img = 'http://www.hrbcentreprimarycare.ie/images/rcsilogonewer.png'

    for value in range(1, 4):
        url = urlOriginal + format(value)
        print(url)
        uClient = uReq(url)
        try:
            page_html = uClient.read()
        finally:
            uClient.close()  # release the connection even if read() fails

        page_soup = soup(page_html, "html.parser")
        items = page_soup.find_all('li',
                                   {"class": "wrapped wrapped-borderless"})
        for container in items:
            try:
                Title = container.h3.text.strip()
                URL = 'https://www.rcsi.com' + container.a['href']
                date_split = container.div.text.replace("\n", " ").split(' ')
                day = date_split[1]
                # Reset per card so a range from an earlier card cannot leak
                # into a later single-day event.
                end_day = None
                if "-" in day:
                    datecheck = day.split('-')
                    day = datecheck[0]
                    end_day = datecheck[1]
                month = datetime.datetime.strptime(date_split[2],
                                                   '%b').strftime('%B')
                year = datetime.datetime.now().year
                month_number = int(month_string_to_number(month))
                d1 = datetime.datetime(year, month_number, int(day))
                Location = container.p.text.strip('\n')
            except (AttributeError, TypeError, KeyError, IndexError,
                    ValueError):
                continue  # malformed card

            date1 = day + ' ' + month + ' ' + str(year)

            enddate = 'None'
            if end_day is not None:
                try:
                    # A valid end day replaces the start day for filtering.
                    d1 = datetime.datetime(year, month_number, int(end_day))
                    enddate = end_day + ' ' + month + ' ' + str(year)
                except ValueError:
                    pass

            if d1 <= datetime.datetime.now():
                continue

            try:
                if Location == 'Dublin':
                    # Fixed fallback: [latitude, longitude, address].
                    ordinates = [53.3498091, -6.2602548,
                                 "The Spire,North City,Dublin"]
                else:
                    ordinates = getOrdinates(Location)
                # NOTE(review): kept from the original; presumably
                # getOrdinates can hand back the text 'Dublin' - confirm.
                if str(ordinates) == 'Dublin':
                    ordinates = getOrdinates("Dublin")
            except Exception:
                continue

            data = EventData()
            data.id = str(uuid.uuid1())
            data.title = Title
            data.time = ''
            data.location = Location
            data.summary = ''
            data.img = img
            data.category = Category
            data.startdate = date1
            data.read_more = URL
            data.address = ordinates[2]
            data.latitude = ordinates[0]
            data.longitude = ordinates[1]
            data.enddate = enddate
            data.price = ''
            data_list.append(data)

    print(len(data_list))
    return data_list
def scrape(urlOriginal, data_list):
    """Collect tourism listings from pages 1-135 of ireland.com.

    Each ``div.details`` element supplies a title, location, description,
    image and link. No date is available, so every listing whose location
    can be geocoded is recorded.

    Args:
        urlOriginal: Listing URL prefix; ``<page number>/`` is appended.
        data_list: Ignored - see the note below.

    Returns:
        A new list with one ``EventData`` per listing.
    """
    # NOTE(review): the incoming list is discarded and a fresh one returned,
    # unlike the sibling scrapers, which append to the caller's list. Kept
    # as-is because callers may rely on it.
    data_list = []
    category = 'TOURISM & SIGHTSEEING'
    placeholder_image = 'https://www.fhi.no/globalassets/bilder/vaksine/oversikt-reisevaksine.jpg?preset=mainbodywidth'

    for value in range(1, 136):
        url = urlOriginal + format(value) + '/'
        print(url)
        try:
            uClient = uReq(url)
            try:
                page_html = uClient.read()
            finally:
                uClient.close()
        except Exception as err:
            # Best effort: skip pages that cannot be downloaded instead of
            # re-reading a stale (or undefined) connection.
            print('Skipping {}: {}'.format(url, err))
            continue

        page_soup = soup(page_html, "html.parser")
        for container in page_soup.findAll('div', class_='details'):
            try:
                title = container.h2.text
            except AttributeError:
                title = 'None'

            try:
                location = container.h3.text
                read = 'https://www.ireland.com/' + container.a['href']
            except (AttributeError, TypeError, KeyError):
                continue  # no location or link: nothing usable to record

            try:
                description = container.p.text.strip('\n').strip(' ')
            except AttributeError:
                description = 'None'

            try:
                img = container.img['src']
            except (TypeError, KeyError):
                img = placeholder_image

            print(location)
            if location == 'Dublin':
                # Fixed fallback: [latitude, longitude, address].
                ordinates = [53.3498091, -6.2602548,
                             "The Spire,North City,Dublin"]
            else:
                try:
                    ordinates = getOrdinates(location)
                except Exception:
                    continue

            data = EventData()
            data.id = str(uuid.uuid1())
            data.title = title
            data.time = ''
            data.location = location
            data.summary = description
            data.img = img
            data.category = category
            data.startdate = ''
            data.read_more = read
            data.address = ordinates[2]
            data.latitude = ordinates[0]
            data.longitude = ordinates[1]
            data.enddate = ''
            data.price = ''
            data_list.append(data)

    print(len(data_list))
    return data_list
def scrape(url, data_list):
    """Collect upcoming events from knowledgetransferireland.com.

    Pages 1 and 2 of the "Upcoming Events" listing are read. Each
    ``div.each-item`` provides the day, abbreviated month and year in three
    ``span`` tags, plus a title, link, description and (in the third ``p``)
    a location. Only events dated after the current moment are recorded.

    Args:
        url: Ignored; the listing address is hard-coded in this function.
        data_list: List that receives one ``EventData`` per upcoming event
            (mutated in place).

    Returns:
        The same ``data_list`` object that was passed in.
    """
    listing = "https://www.knowledgetransferireland.com/Events/Upcoming-Events/?pageNumber={}"
    category = 'EDUCATION, BUSINESS & TECHNOLOGY'
    img = 'https://uindia.net/assets/img/MediaTechnology.jpg'

    for value in range(1, 3):
        page_url = listing.format(value)
        print(page_url)
        try:
            uClient = uReq(page_url)
            try:
                page_html = uClient.read()
            finally:
                uClient.close()
        except Exception as err:
            # Best effort: skip pages that cannot be downloaded instead of
            # re-reading a stale (or undefined) connection.
            print('Skipping {}: {}'.format(page_url, err))
            continue

        page_soup = soup(page_html, "html.parser")
        for container in page_soup.findAll('div', {"class": "each-item"}):
            try:
                span_tags = container.div.findAll('span')
                day = span_tags[0].text
                month_abbr = span_tags[1].text
                year = span_tags[2].text
                month_name = datetime.datetime.strptime(
                    month_abbr, '%b').strftime('%B')
                d1 = datetime.datetime(
                    int(year), int(month_string_to_number(month_abbr)),
                    int(day))
                read_more = ('https://www.knowledgetransferireland.com/' +
                             container.h2.a['href'])
            except (AttributeError, TypeError, KeyError, IndexError,
                    ValueError):
                continue  # malformed card
            if d1 <= datetime.datetime.now():
                continue  # past event: no need to geocode it

            startdate = day + ' ' + month_name + ' ' + year
            title = container.h2.text
            try:
                description = container.p.text
            except AttributeError:
                description = 'none'

            p_tags = container.findAll('p')
            if len(p_tags) > 2:
                location = p_tags[2].text + "Dublin"
            else:
                # Previously the fallback became 'DublinDublin'.
                location = 'Dublin'

            try:
                ordinates = getOrdinates(location)
            except Exception:
                continue

            data = EventData()
            data.id = str(uuid.uuid1())
            data.title = title
            data.time = ''
            data.location = location
            data.summary = description
            data.img = img
            data.category = category
            data.startdate = startdate
            data.read_more = read_more
            data.address = ordinates[2]
            data.latitude = ordinates[0]
            data.longitude = ordinates[1]
            data.enddate = ''
            data.price = ''
            data_list.append(data)

    return data_list
def scrape(urlOriginal, data_list):
    """Collect upcoming events from 26 pages of a JSON events API.

    Each page is requested with ``requests`` and is expected to hold its
    events under ``_embedded.events``. For every event the name, URL, local
    start date/time, third image, classification segment and first venue
    name are read. Only events dated after the current moment are recorded.

    Args:
        urlOriginal: API URL prefix; the page number (1-26) is appended.
        data_list: List that receives one ``EventData`` per upcoming event
            (mutated in place).

    Returns:
        The same ``data_list`` object that was passed in.
    """
    # API segment name -> site category; anything else becomes 'OTHERS'.
    category_names = {
        'Arts & Theatre': 'FASHION, ART & THEATRE',
        '': 'FASHION, ART & THEATRE',
        'Sport': 'SPORTS & HEALTH',
        'Sports': 'SPORTS & HEALTH',
        'Family & Attractions': 'COMMUNITY & FESTIVALS',
        'Music': 'MUSIC & ENTERTAINMENT',
    }

    for value in range(1, 27):
        url = urlOriginal + format(value)
        print(url)
        try:
            payload = requests.get(url).json()
        except Exception as err:
            # Best effort: skip pages that fail instead of re-processing the
            # previous page's (or an undefined) response.
            print('Skipping {}: {}'.format(url, err))
            continue
        if not isinstance(payload, dict):
            continue
        # Pages past the end of the result set carry no '_embedded' key.
        events = (payload.get('_embedded') or {}).get('events') or []

        for event in events:
            try:
                Title = event["name"]
                URL = event["url"]
                start = event['dates']['start']
                year_text, month_text, day_text = start['localDate'].split('-')
                d1 = datetime.datetime(int(year_text), int(month_text),
                                       int(day_text))
                Location = event['_embedded']['venues'][0]['name'] + "Dublin"
            except (KeyError, IndexError, TypeError, ValueError):
                continue  # event lacks a usable name, date or venue
            if d1 <= datetime.datetime.now():
                continue  # past event: no need to geocode it

            date = day_text + ' ' + d1.strftime('%B') + ' ' + year_text
            Time = start.get('localTime', '')

            images = event.get('images') or []
            if len(images) > 2:
                img = images[2]['url']
            elif images:
                img = images[0]['url']
            else:
                img = ''

            try:
                raw_category = event['classifications'][0]['segment']['name']
            except (KeyError, IndexError, TypeError):
                raw_category = None
            category = category_names.get(raw_category, 'OTHERS')

            if Location == 'Dublin':
                # Fixed fallback: [latitude, longitude, address].
                ordinates = [53.3498091, -6.2602548,
                             "The Spire,North City,Dublin"]
            else:
                try:
                    ordinates = getOrdinates(Location)
                except Exception:
                    continue

            data = EventData()
            data.id = str(uuid.uuid1())
            data.title = Title
            data.time = Time
            data.location = Location
            data.summary = ''
            data.img = img
            data.category = category
            data.startdate = date
            data.read_more = URL
            data.address = ordinates[2]
            data.latitude = ordinates[0]
            data.longitude = ordinates[1]
            data.enddate = ''
            data.price = ''
            data_list.append(data)

    print(len(data_list))
    return data_list
def scrape(url, data_list):
    """Collect upcoming events from a single poetryireland.ie listing page.

    Each ``article`` card supplies a title, image, link, a "<weekday> <day>
    <month abbreviation>, <time>" text and a location. The year is not on
    the page, so the current year is assumed. Cards dated after the current
    moment are appended to ``data_list`` as ``EventData`` objects.

    Args:
        url: Address of the listing page.
        data_list: List that receives the events (mutated in place).

    Returns:
        The same ``data_list`` object that was passed in.
    """
    connection = uReq(url)
    raw_html = connection.read()
    connection.close()

    parsed = soup(raw_html, "html.parser")
    card_class = {"class": "col-sm-1-3 col-md-1-3 col-lg-4 item item-event"}

    for card in parsed.findAll('article', card_class):
        title = card.h2.text

        links = card.findAll('a')
        first_link = links[0]
        image = 'https://www.poetryireland.ie' + first_link.img['src']
        print(first_link['href'])

        sections = card.findAll('div')
        # Third div holds "<date part>,<time part>".
        when = sections[2].text.strip('\n\t').split(',')
        event_time = when[1]
        date_tokens = when[0].split(' ')
        day = date_tokens[1]
        month_abbr = date_tokens[2]
        month_name = datetime.datetime.strptime(month_abbr,
                                                '%b').strftime('%B')
        year = datetime.datetime.now().year
        start_text = ' '.join((day, month_name, str(year)))
        starts_at = datetime.datetime(
            int(year), int(month_string_to_number(month_abbr)), int(day))

        location = sections[3].text + ',' + "Dublin"
        ordinates = getOrdinates(location)
        if str(ordinates) == 'Dublin':
            ordinates = getOrdinates("Dublin")

        # The second anchor is the link stored with the event.
        read_more = links[1]['href']

        if starts_at <= datetime.datetime.now():
            continue

        data = EventData()
        data.id = str(uuid.uuid1())
        data.title = title
        data.time = event_time
        data.location = location
        data.summary = ''
        data.img = image
        data.category = 'FASHION, ART & THEATRE'
        data.startdate = start_text
        data.read_more = read_more
        data.address = ordinates[2]
        data.latitude = ordinates[0]
        data.longitude = ordinates[1]
        data.enddate = ''
        data.price = ''
        data_list.append(data)

    print(len(data_list))
    return data_list
def scrape(urlOriginal, data_list):
    """Collect upcoming events from four pages of "what's on" articles.

    Each ``article.article.whatsonarticle`` element supplies a title, image,
    category text, link, a "start - end" date text and a location. The
    site's category text is mapped onto one of the application's categories
    (unknown text becomes 'OTHERS'), and a per-category placeholder image is
    used when the article has none. An event is kept when its end date (or
    start date, if there is no end date) is in the future.

    Args:
        urlOriginal: Listing URL prefix; ``<page number>/`` is appended.
        data_list: List that receives one ``EventData`` per upcoming event
            (mutated in place).

    Returns:
        The same ``data_list`` object that was passed in.
    """
    # Listed in the priority order of the original if/elif chain; the first
    # group that names a raw category wins.
    category_groups = (
        ('FASHION, ART & THEATRE', (
            'Activities, Fashion',
            'Activities, Art, Workshop',
            'Art, Exhibition',
            'Activities, Christmas, Family, Theatre',
            'Activities, Theatre',
        )),
        ('FOOD & DRINK', (
            'Activities, Drinks, Family, Food And Drink,Nightlife,Talks, Workshop',
            'Food And Drink',
            'Drinks, Food And Drink, History, Tours',
            'Drinks, History, Tours',
            'Drinks, Food And Drink',
            'Drinks',
            'Activities, Drinks, Family, Food And Drink, Nightlife, Talks, Workshop',
            'Culture, Food And Drink',
        )),
        ('COMMUNITY & FESTIVALS', (
            'Activities, Culture, Exhibition',
        )),
        ('MUSIC & ENTERTAINMENT', (
            'Tours',
            'Music, Tours',
            'Culture, Tours',
            'Music, Nightlife',
            'Music',
            'Activities, Art, Culture, Drinks, Fashion, Food And Drink, Free, Market',
            'Activities, Comedy',
            'Activities',
            'Christmas, Music',
            'Activities, Christmas, Cinema, Music',
            'Activities, Family',
        )),
        ('FASHION, ART & THEATRE', (
            'Theatre',
            'Fashion',
            'Culture, Theatre',
            'Halloween, Theatre',
            'Culture, Exhibition, Family, Food And Drink',
            'Family, Theatre',
            'Activities, Halloween, Nightlife, Tours',
            'Beauty, Christmas',
        )),
        ('OTHERS', ('',)),
        ('SPORTS & HEALTH', ('Beauty, Fitness',)),
    )
    category_lookup = {}
    for label, raw_names in category_groups:
        for raw_name in raw_names:
            category_lookup.setdefault(raw_name, label)

    placeholder_images = {
        'FASHION, ART & THEATRE': 'https://4.bp.blogspot.com/-haQkpIywgPA/W5L1p-6P5JI/AAAAAAAANv4/279R0n1im_MugfsnYTlbf5ZiTaG2s7NYQCLcBGAs/s1600/Six_photoby_IdilSukan_18.jpg',
        'FOOD & DRINK': 'https://anandipaliwal.files.wordpress.com/2015/06/food-table-relisted.jpg',
        'COMMUNITY & FESTIVALS': 'https://www.totallydublin.ie/wp-content/uploads/2017/07/Just-Eat-Street.jpg',
        'MUSIC & ENTERTAINMENT': 'https://livestyle.com/wp-content/uploads/2017/07/slider-4.jpg',
        'OTHERS': 'https://discuss.fm/images/max_topic_images/others.jpg',
        'SPORTS & HEALTH': 'https://previews.123rf.com/images/tnn103eda/tnn103eda1705/tnn103eda170500019/79377445-huge-multi-sports-collage-soccer-basketball-football-hockey-baseball-boxing-etc.jpg',
    }
    location_aliases = {
        'The Grafton Quarter': 'The Grafton street',
        'The Grafton Quarter Dublin': 'The Grafton street',
        'Dublin One': 'Parnell street',
    }

    def to_datetime(day, month_abbr, year):
        """Return ``(display_text, datetime)`` for day / 'Mon' / year strings."""
        monthfull = datetime.datetime.strptime(month_abbr,
                                               '%b').strftime('%B')
        moment = datetime.datetime(int(year),
                                   int(month_string_to_number(monthfull)),
                                   int(day))
        return day + ' ' + monthfull + ' ' + year, moment

    for value in range(1, 5):
        url = urlOriginal + format(value) + '/'
        print(url)
        uClient = uReq(url)
        try:
            page_html = uClient.read()
        finally:
            uClient.close()  # release the connection even if read() fails

        page_soup = soup(page_html, "html.parser")
        articles = page_soup.findAll('article',
                                     {"class": "article whatsonarticle"})
        for container in articles:
            try:
                Title = container.h3.text.strip()
                raw_category = container.h5.text.strip()
                URL = container.a['href']
                split_date = container.cite.text.strip('\n\t\t').split('-')

                start_tokens = split_date[0].split(' ')
                start_date, d1 = to_datetime(start_tokens[0], start_tokens[1],
                                             start_tokens[2])
                start_date = start_date.strip('\t\t')

                # Compare by value: the original used ``is not 'None'``,
                # which only works through CPython string interning.
                if len(split_date) > 1:
                    # The end text starts with a space, hence indices 1-3.
                    end_tokens = split_date[1].split(' ')
                    end_date, d1 = to_datetime(end_tokens[1], end_tokens[2],
                                               end_tokens[3])
                else:
                    end_date = 'None'

                a_tags = container.div.findAll('a')
                location = a_tags[2].text.split('|')[0]
            except (AttributeError, TypeError, KeyError, IndexError,
                    ValueError):
                continue  # malformed article

            if d1 <= datetime.datetime.now():
                continue  # past event: no need to geocode it

            category = category_lookup.get(raw_category, 'OTHERS')
            try:
                image = container.a.img['src']
            except (TypeError, KeyError):
                image = placeholder_images[category]

            location = location_aliases.get(location, location)
            location = location + ' ' + "Dublin"
            ordinates = getOrdinates(location)
            if str(ordinates) == 'None':
                # Unknown address: fall back to the city itself.
                ordinates = getOrdinates("Dublin")

            data = EventData()
            data.id = str(uuid.uuid1())
            data.title = Title
            data.time = ''
            data.location = location
            data.summary = ''
            data.img = image
            data.category = category
            data.startdate = start_date
            data.read_more = URL
            data.address = ordinates[2]
            data.latitude = ordinates[0]
            data.longitude = ordinates[1]
            data.enddate = end_date
            data.price = ''
            data_list.append(data)

    print(len(data_list))
    return data_list
def scrape(urlOriginal, data_list):
    """Collect upcoming events from ten news-and-events pages of dbs.ie.

    Each ``li.sfnewsListItem`` supplies a title, a comma-separated date text
    (month and day in the second field, year in the third) and a summary.
    Every event is located at "Dublin Business School, Dublin". Only events
    dated after the current moment are recorded.

    Args:
        urlOriginal: Listing URL prefix; ``<page number>/`` is appended.
        data_list: List that receives one ``EventData`` per upcoming event
            (mutated in place).

    Returns:
        The same ``data_list`` object that was passed in.
    """
    location = "Dublin Business School, Dublin"
    category = "EDUCATION, BUSINESS & TECHNOLOGY"
    # The location never changes, so it is geocoded once on first use and
    # the result reused; a failed lookup is retried for the next event.
    ordinates = None

    for value in range(1, 11):
        url = urlOriginal + format(value) + '/'
        print(url)
        uClient = uReq(url)
        try:
            page_html = uClient.read()
        finally:
            uClient.close()  # release the connection even if read() fails

        page_soup = soup(page_html, "html.parser")
        article = page_soup.findAll('li', class_='sfnewsListItem sflistitem')
        item = page_soup.findAll('li', class_='sfrelatedListItem sflistitem')
        readmore = page_soup.findAll('div', class_='NewsEvent_right')

        for container in article:
            try:
                title = container.a.text
                div_tags = container.findAll('div')
                date_fields = div_tags[0].text.strip().split(',')
                year = date_fields[2]
                month_and_day = date_fields[1].split(' ')
                Date = month_and_day[2]
                month = month_and_day[1]
                summary = div_tags[1].text.strip()
                # NOTE(review): every event on a page receives the page's
                # first image and first link, as in the original - confirm
                # whether these should be matched per event.
                img = item[0].a.img['src']
                read_more = ('https://www.dbs.ie/about-dbs/news-and-events/' +
                             readmore[0].a['href'])
                monthInt: int = month_string_to_number(month)
                d1 = datetime(int(year), monthInt, int(Date))
            except (AttributeError, TypeError, KeyError, IndexError,
                    ValueError):
                continue  # malformed entry

            if d1 <= datetime.now():
                continue

            # Previously joined with empty separators ("4March 2019").
            fulldate = Date + ' ' + month + ' ' + year.strip()

            if ordinates is None:
                try:
                    ordinates = getOrdinates(location)
                except Exception:
                    continue
                if ordinates is None:
                    continue

            data = EventData()
            data.id = str(uuid.uuid1())
            data.title = title
            data.startdate = fulldate
            data.enddate = ''
            data.time = ''
            data.category = category
            data.price = ''
            data.summary = summary
            data.address = ordinates[2]
            data.location = location
            data.img = img
            data.latitude = ordinates[0]
            data.longitude = ordinates[1]
            data.read_more = read_more
            data_list.append(data)

    print(len(data_list))
    return data_list