def scrape(url, data_list):
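        """Scrape upcoming events from the Whitehall College news page and append them to data_list as EventData records."""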

        uClient = uReq(url)
        page_html = uClient.read()
        uClient.close()

        # Parse the page
        page_soup = soup(page_html, "html.parser")

        # Find each event block
        div = page_soup.findAll('div', {"class": "wrapBlock wrapBlockNews"})

        for container in div:
            # container = div[0]
            title = container.h2.text
            date = container.em.text.strip('()')
            date = date.split('/')
            dateArr = date
            month = date[1]
            # date format change: DD/MM/YYYY -> "DD Month YYYY"
            month = datetime.datetime.strptime(month, '%m').strftime('%B')
            date = dateArr[0] + ' ' + month + ' ' + dateArr[2]

            d1 = datetime.datetime(int(dateArr[2]),
                                   int(month_string_to_number(month)),
                                   int(dateArr[0]))
            d2 = datetime.datetime.now()
            #wh = container.p.text
            #wh = str(wh)
            location = 'Whitehall College of Further Education, Drumcondra, Dublin'

            ordinates = getOrdinates(location)
            read_more = container.a['href']
            Category = 'EDUCATION, BUSINESS & TECHNOLOGY'
            img = 'https://whitehallcollege.com/uploads/pages/logo.jpg'

            if d1 > d2:
                data = EventData()

                data.id = uuid.uuid1().__str__()
                data.title = title
                data.startdate = date
                data.enddate = ''
                data.time = ''
                data.category = Category
                data.price = ''
                data.summary = ''
                data.location = location
                data.address = ordinates[2]
                data.latitude = ordinates[0]
                data.longitude = ordinates[1]
                data.img = img
                data.read_more = read_more
                data_list.append(data)

        print(len(data_list))
        return data_list
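
# Every scrape() snippet in this collection leans on a handful of shared helpers
# that are not shown here: uReq/soup (urllib + BeautifulSoup aliases), EventData,
# month_string_to_number and getOrdinates (later examples also use re, json and
# requests). The sketch below is only an assumption of what they look like,
# inferred from how they are called above, not the project's actual code.
import datetime
import uuid
from urllib.request import urlopen as uReq  # assumed alias
from bs4 import BeautifulSoup as soup       # assumed alias


class EventData:
    """Plain container for one scraped event; fields inferred from usage."""

    def __init__(self):
        self.id = self.title = self.startdate = self.enddate = ''
        self.time = self.category = self.price = self.summary = ''
        self.location = self.address = self.img = self.read_more = ''
        self.latitude = self.longitude = None


def month_string_to_number(name):
    """Map 'Jan'/'January' (any case) to 1..12."""
    return datetime.datetime.strptime(name.strip()[:3], '%b').month


def getOrdinates(location):
    """Return [latitude, longitude, formatted_address] for a location string.

    Sketch only: the real project presumably calls a geocoding service here
    and falls back to central Dublin when nothing is found.
    """
    return [53.3498091, -6.2602548, location]
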
    def scrape(url, data_list):
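        """Scrape upcoming news events from the ITB (TU Dublin Blanchardstown) NewsEvents page into data_list."""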

        uClient = uReq(url)
        page_html = uClient.read()
        uClient.close()

        # Parse the page
        page_soup = soup(page_html, "lxml")
        # Find each event block
        article = page_soup.find_all('div', class_='news_snippet')

        for container in article:
            # container = article[0]
            title = container.h3.text.strip()
            p_tags = container.findAll('p')
            olddate = p_tags[0].text.strip()
            # date formatting (DD/MM/YYYY)
            datesplit = olddate.split('/')
            date = datesplit[0]
            dateTemp = date
            month = datesplit[1]
            monthTemp = month
            month = datetime.datetime.strptime(month, '%m').strftime('%B')
            year = datesplit[2].split(' ')[0]

            date = date + ' ' + month + ' ' + year

            d1 = datetime.datetime(int(year), int(monthTemp), int(dateTemp))

            summary = p_tags[1].text.strip()
            read_more = container.a['href']
            read_more = 'https://www.itb.ie/NewsEvents/' + read_more
            location = "Technological University Dublin, Blanchardstown, Dublin 15"
            Category = 'EDUCATION, BUSINESS & TECHNOLOGY'
            image = 'https://uindia.net/assets/img/MediaTechnology.jpg'

            if d1 > datetime.datetime.now():
                data = EventData()

                data.id = uuid.uuid1().__str__()
                data.title = title
                data.img = image
                data.startdate = date
                data.enddate = ' '
                data.price = ' '
                data.summary = summary
                data.time = ''
                data.location = location
                data.read_more = read_more
                data.category = Category
                data_list.append(data)

        print(len(data_list))
        return data_list
    def scrape(url,data_list):
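        """Scrape upcoming events from the TU Dublin article listing into data_list."""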

        uClient = uReq(url)
        page_html = uClient.read()
        uClient.close()

        # Finding each events
        page_html = soup(page_html, "html.parser")

        # Finding each events
        article = page_html.find_all('div',{"class":"article-list__col col-sm-6 col-md-3"})

        for arti in article:

            read_more = "https://tudublin.ie" + arti.a["href"]

            date = arti.find_all('p')[0].text.strip()
            new_date = date.split(',')[0].split(' ')[0]
            month = date.split(',')[0].split(' ')[1]
            year = date.split(',')[1].strip()

            monthTemp = month
            month = datetime.datetime.strptime(month, '%b').strftime('%B')
            date = new_date + " " + month + " " + year

            d1 = datetime.datetime(int(year), int(month_string_to_number(monthTemp)), int(new_date))

            category = "EDUCATION, BUSINESS & TECHNOLOGY"
            location = arti.find_all('li',{"class":"article-list__location"})[0].text.strip()
            desc = arti.find_all('p')[1].text.strip()
            title = arti.h3.text
            image = "https://tudublin.ie" + arti.img["src"]
            time = arti.find_all('li',{"class":"article-list__time"})[0].text.strip()

            if d1>datetime.datetime.now():
                data = EventData()

                data.id = uuid.uuid1().__str__()
                data.title = title
                data.img = image
                data.startdate = date
                data.enddate = ' '
                data.price = ' '
                data.summary = desc
                data.time = time
                data.location = location
                data.read_more = read_more
                data.category = category
                data_list.append(data)


        print(len(data_list))
        return data_list
    def scrape(urlOriginal, data_list):
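        """Scrape a paginated Eventbrite-style search listing, following each event's detail page for description, date and venue."""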

        i = 0
        #3
        for value in range(1, 3):
            url = ""
            url = urlOriginal + format(value)
            print(url)
            try:
                uClient = uReq(url)
            except:
                # Skip this page if the request failed instead of reusing a stale client
                continue
            page_html = uClient.read()
            uClient.close()
            #Parsing
            page_soup = soup(page_html, "html.parser")
            #article = page_soup.findAll('ul',class_='search-main-content__events-list')
            article_1 = page_soup.findAll('div',
                                          class_='search-event-card-wrapper')

            # fetching each details
            for container in article_1:
                title = container.findAll(
                    'div', class_='eds-event-card__formatted-name--is-clamped'
                )[0].text

                try:
                    Date_time = container.findAll(
                        'div',
                        class_=
                        'eds-text-color--primary-brand eds-l-pad-bot-1 eds-text-weight--heavy eds-text-bs'
                    )[0].text
                except:
                    Date_time = 'None'
                # try:
                #     Location = container.findAll('div',class_='card-text--truncated__one')[0].text
                # except:
                #     Location='None'
                try:
                    Price = container.findAll(
                        'div',
                        class_=
                        'eds-media-card-content__sub eds-text-bm eds-text-color--grey-600 eds-l-mar-top-1 eds-media-card-content__sub--cropped'
                    )[1].text
                except:
                    Price = 'None'
                a_tags = container.findAll('a')
                try:
                    image = a_tags[0].img['src']
                except:
                    image = 'None'
                read_more = a_tags[0]['href']
                print(read_more)

                category = 'COMMUNITY & FESTIVALS'
                if category == 'COMMUNITY & FESTIVALS' and image == 'None':
                    image = 'https://www.totallydublin.ie/wp-content/uploads/2017/07/Just-Eat-Street.jpg'

                # description

                descurl = read_more
                #Opening connection , grabbing the page
                try:
                    uClient = uReq(descurl)
                except:
                    # Skip this event if its detail page cannot be fetched
                    continue
                desc_html = uClient.read()
                uClient.close()
                #Parsing
                desc_soup = soup(desc_html, "html.parser")

                desc = desc_soup.findAll(
                    'div', class_='js-xd-read-more-contents l-mar-top-3'
                ) or desc_soup.findAll(
                    'div',
                    class_=
                    'structured-content-rich-text structured-content__module l-align-left l-mar-vert-6 l-sm-mar-vert-4 text-body-medium'
                )
                if len(desc) > 0:
                    try:
                        p_tags = desc[0].findAll('p')
                    except:
                        continue

                    # Join all paragraph texts (avoid reusing the outer counter variable i)
                    description = ''.join(p.text for p in p_tags)
                else:
                    description = 'None'

                # Date fetching and formatting

                time = desc_soup.findAll('time', class_='clrfix')
                if len(time) > 0:
                    time_tags = time[0].findAll('p')
                    date_check = time_tags[0].text

                    if date_check in ('Multiple Dates', 'Multiple Dates GMT', 'Multiple Dates IST'):
                        Final_Date = date_check

                    else:
                        Date_time = date_check.split(',')

                        if (len(Date_time)) == 2:
                            Final_Date = Date_time[1].strip(' ')

                        else:
                            Mon_Date = Date_time[1].split(' ')
                            if len(Mon_Date) == 3:
                                Date = Mon_Date[2]
                                month = Mon_Date[1]
                                if len(month) <= 3:
                                    Month = datetime.datetime.strptime(
                                        month, '%b').strftime('%B')
                                else:
                                    Month = month
                                year = Date_time[2]
                                Final_Date = Date + (' ') + Month + year

                            elif len(Mon_Date) == 4:
                                Date = Mon_Date[1]
                                month = Mon_Date[2]
                                Month = datetime.datetime.strptime(
                                    month, '%b').strftime('%B')
                                year = Mon_Date[3]
                                Final_Date = Date + (' ') + Month + (
                                    ' ') + year

                else:
                    Final_Date = 'None'

                #location fetching
                location_div = desc_soup.findAll('div',
                                                 class_='event-details__data')
                if len(location_div) > 0:
                    location_tags = location_div[1].findAll('p')
                    locat = location_tags[0].text
                    location = locat + (' ') + "Dublin"
                else:
                    location = 'Dublin'

                print(location)

                try:
                    if location == 'Dublin':
                        # Default to the Spire when only the city is given
                        ordinates = [53.3498091, -6.2602548, "The Spire,North City,Dublin"]
                    else:
                        ordinates = getOrdinates(location)
                except:
                    continue

                try:
                    d1 = datetime.datetime(int(year),
                                           int(month_string_to_number(Month)),
                                           int(Date))
                except:
                    continue

                d2 = datetime.datetime.now()

                if d1 > d2:
                    data = EventData()

                    data.id = uuid.uuid1().__str__()
                    data.title = title
                    data.time = ''
                    data.location = location
                    data.summary = description
                    data.img = image
                    data.category = category
                    data.address = ordinates[2]
                    data.startdate = Final_Date
                    data.read_more = read_more
                    data.enddate = ''
                    data.price = Price
                    data.latitude = ordinates[0]
                    data.longitude = ordinates[1]
                    data_list.append(data)
                    i = i + 1

            # print(len(data))

        print(len(data_list))
        return data_list
    def scrape(url,data_list):
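        """Scrape event listings from the Sugar Club site (grid-item cards) into data_list."""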

        uClient = uReq(url)
        page_html = uClient.read()
        uClient.close()

        # Parse the page
        page_soup = soup(page_html, "html.parser")
        # Find each event block
        div = page_soup.findAll('div', {"class": "grid-item"})
        for container in div:
            try:
                url=container.a['href']
            except:
                url=''
            try:
                image=container.a.img['src']
            except:
                image='None'
            title = container.h3.text
            date = container.h2.text
            # date formatting
            datesplit = date.split('/')
            date_month_year = datesplit[0].split('.')
            date = date_month_year[0].split(' ')[1]
            dateTemp = date
            month = date_month_year[1]
            monthTemp = month
            month = datetime.datetime.strptime(month, '%m').strftime('%B')
            year = date_month_year[2]
            date = date + ' ' + month + ' ' + year

            d1 = datetime.datetime(int(year), int(monthTemp), int(dateTemp))
            d2 = datetime.datetime.now()

            div_tags = container.findAll('div')
            time = datesplit[1]
            price = div_tags[1].p.text
            location = "Sugar Club, Leeson Street, Dublin"

            
                
            ordinates = getOrdinates(location)
            

            category = "MUSIC & ENTERTAINMENT"
            if category == 'MUSIC & ENTERTAINMENT' and image == 'None':
                image = 'https://livestyle.com/wp-content/uploads/2017/07/slider-4.jpg'

            if d1>d2:
                data = EventData()

                data.id = uuid.uuid1().__str__()
                data.title = title
                data.time = time
                data.location = location
                data.summary = ''
                data.img = image
                data.category = category
                data.startdate = date
                data.read_more = url
                data.address = ordinates[2]
                data.latitude = ordinates[0]
                data.longitude = ordinates[1] 
                data.enddate = ''
                data.price = price
                data_list.append(data)

        print(len(data_list))

        return data_list
    def scrape(url,data_list):
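        """Scrape an 'event card' listing, following each event's detail page for date, time and price."""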

        uClient = uReq(url)
        page_html = uClient.read()
        uClient.close()

        page_soup = soup(page_html, "html.parser")

        # Finding each events
        article = page_soup.find_all('article', class_="event card")


        for container in article:
            # container = article[0]
            div = container.find_all('div', class_="text")
            v = div[0].a["href"]
            img = container.div["style"]
            pattern = r"(?<=\(')[^'\)]*"
            img = re.search(pattern, img)
            img = img[0]
            category = container["data-categories"]
            if category=='ART & THEATRE':
                category='FASHION, ART & THEATRE'
            elif category=='BUSINESS & TECH':
                category='EDUCATION, BUSINESS & TECHNOLOGY'
            elif category=='FAMILY FRIENDLY':
                category='COMMUNITY & FESTIVALS'
            elif category=='FESTIVALS':
                category='COMMUNITY & FESTIVALS'
            elif category=='FILM & LITERATURE':
                category='FASHION, ART & THEATRE'
            elif category=='FOOD & DRINK':
                category='FOOD & DRINK'
            elif category=='FREE':
                category='FREE'
            elif category=='LEARNING':
                category='EDUCATION, BUSINESS & TECHNOLOGY'
            elif category=='MUSIC & COMEDY':
                category='MUSIC & ENTERTAINMENT'
            elif category=='SPORTS':
                category='SPORTS & HEALTH'
            url2 = v
            uClient = uReq(url2)
            page_html = uClient.read()
            uClient.close()
            page_soup = soup(page_html, "html.parser")
            div = page_soup.find_all('div', class_="event-info")
            details = div[0].findAll('dd')

            datadict = {}
            var1 = div[0].findAll('dt')
            for i in range(len(var1)):
                datadict[var1[i].text] = details[i].text

            date = datadict['Date:']
            time = datadict['Time:']
            if 'Price:' in datadict:
                price = datadict['Price:']
            else:
                continue

            #address = datadict['Address:']
            var2 = date.split('-')
            startdate = var2[0].split(' ')
            # strip the ordinal suffix from the day, e.g. "21st" -> "21"
            date = startdate[1].replace('th', '').replace('nd', '').replace('st', '').replace('rd', '')
            month = startdate[2]
            year = datetime.now().year
            startdate = date + ' ' + month + ' ' + year.__str__()
            d1 = datetime(year, int(month_string_to_number(month)), int(date))
            try:
                enddate = var2[1]
                enddateArr = enddate.split(' ')
                end=enddateArr[1].replace('th','').replace('nd','').replace('st','').replace('rd','')

                d1 = datetime(year, int(month_string_to_number(enddateArr[2])), int(end))

            except:
                enddate = 'None'
            title = page_soup.h1.text
            #time = container.time.text
            p_tags = container.findAll('p')
            location = p_tags[0].text
            summary = p_tags[1].text.strip()
            read_more = url2

            if d1>datetime.now():
                data = EventData()

                data.id = uuid.uuid1().__str__()
                data.title = title
                data.time = time
                data.location = location
                data.summary = summary
                data.img = img
                data.category = category
                data.startdate = startdate
                data.read_more = read_more
                #data.address = address
                data.enddate = enddate
                data.price = price
                data_list.append(data)

        print(len(data_list))
        
        return data_list
    def scrape(urlOriginal, data_list):
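        """Scrape paginated sightseeing listings from ireland.com into data_list."""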
        data_list = []
        #136
        #100--added
        for value in range(1, 136):

            url = ""
            url = urlOriginal + format(value) + '/'
            print(url)
            try:
                uClient = uReq(url)
            except:
                # Skip this page if the request failed instead of reusing a stale client
                continue
            page_html = uClient.read()
            uClient.close()
            # Parse the page
            page_soup = soup(page_html, "html.parser")
            det = page_soup.findAll('div', class_='details')
            for container in det:
                try:
                    title = container.h2.text
                except:
                    title = 'None'

                location = container.h3.text
                try:
                    description = container.p.text.strip('\n')
                    description = description.strip(' ')
                except:
                    description = 'None'
                try:
                    img = container.img['src']
                except:
                    img = 'None'

                read_more = container.a['href']
                read = 'https://www.ireland.com/' + read_more
                category = 'TOURISM & SIGHTSEEING'
                if category == 'TOURISM & SIGHTSEEING' and img == 'None':
                    img = 'https://www.fhi.no/globalassets/bilder/vaksine/oversikt-reisevaksine.jpg?preset=mainbodywidth'

                print(location)

                if location == 'Dublin':
                    # Default to the Spire when only the city is given
                    ordinates = [53.3498091, -6.2602548, "The Spire,North City,Dublin"]
                else:
                    ordinates = getOrdinates(location)

                data = EventData()

                data.id = uuid.uuid1().__str__()
                data.title = title
                data.time = ''
                data.location = location
                data.summary = description
                data.img = img
                data.category = category
                data.startdate = ''
                data.read_more = read
                data.address = ordinates[2]
                data.latitude = ordinates[0]
                data.longitude = ordinates[1]
                data.enddate = ''
                data.price = ''
                data_list.append(data)

        print(len(data_list))

        return data_list
    def scrape(urlOriginal, data_list):
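        """Scrape paginated upcoming events from the RCSI site into data_list."""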
        #4
        for value in range(1, 4):
            url = ""
            url = urlOriginal + format(value)
            print(url)

            uClient = uReq(url)
            page_html = uClient.read()
            uClient.close()

            page_soup = soup(page_html, "html.parser")
            div = page_soup.find_all('li',
                                     {"class": "wrapped wrapped-borderless"})

            for container in div:
                Title = container.h3.text.strip()
                URL = 'https://www.rcsi.com' + container.a['href']
                date = container.div.text.replace("\n", " ")

                # date formatting
                date_split = date.split(' ')
                date = date_split[1]
                if "-" in date:
                    datecheck = date.split('-')
                    date = datecheck[0]
                    enddate1 = datecheck[1]
                month = date_split[2]
                month = datetime.datetime.strptime(month, '%b').strftime('%B')
                year = datetime.datetime.now().year
                date1 = date + ' ' + month + ' ' + year.__str__()
                d1 = datetime.datetime(int(year),
                                       int(month_string_to_number(month)),
                                       int(date))

                try:
                    enddate = enddate1 + ' ' + month + ' ' + year.__str__()
                    d1 = datetime.datetime(int(year),
                                           int(month_string_to_number(month)),
                                           int(enddate1))
                except:
                    enddate = 'None'

                d2 = datetime.datetime.now()

                Location = container.p.text.strip('\n')

                try:
                    if Location == 'Dublin':
                        # Default to the Spire when only the city is given
                        ordinates = [53.3498091, -6.2602548, "The Spire,North City,Dublin"]
                    else:
                        ordinates = getOrdinates(Location)
                except:
                    continue

                Category = 'EDUCATION, BUSINESS & TECHNOLOGY'
                img = 'http://www.hrbcentreprimarycare.ie/images/rcsilogonewer.png'

                if d1 > d2:
                    data = EventData()

                    data.id = uuid.uuid1().__str__()
                    data.title = Title
                    data.time = ''
                    data.location = Location
                    data.summary = ''
                    data.img = img
                    data.category = Category
                    data.startdate = date1
                    data.read_more = URL
                    data.address = ordinates[2]
                    data.latitude = ordinates[0]
                    data.longitude = ordinates[1]
                    data.enddate = enddate
                    data.price = ''
                    data_list.append(data)

        print(len(data_list))

        return data_list
    def scrape(url, data_list):
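        """Scrape events from the Dublin Chamber portal, following each event's booking page for details."""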

        uClient = uReq(url)
        page_html = uClient.read()
        uClient.close()

        page_soup = soup(page_html, "html.parser")

        article = page_soup.find_all('div', {"class": "c-feed-box-outer"})
        for arti in article:
            url2 = 'https://portal.dublinchamberhosting.com' + arti.a["href"]
            #Opening connection to second page, grabbing the page
            uClient = uReq(url2)
            page_html = uClient.read()
            uClient.close()
            #Parsing2
            page_soup = soup(page_html, "html.parser")
            #print(page_soup)
            try:
                image = page_soup.find_all(
                    'div', {
                        "class":
                        "c-banner_background-image o-cropcontent o-crop_content--center"
                    })[0].img["src"]
            except:
                image = 'https://uindia.net/assets/img/MediaTechnology.jpg'

            desc_p = page_soup.find_all(
                'div', {"class": "description"})[0].p
            try:
                # Some descriptions wrap the text in a <span>
                desc = desc_p.span.text.strip()
            except:
                desc = desc_p.text.strip()
            event_box = page_soup.find_all(
                'div', {"class": "c-event-booking__right-column"})[0]
            event_box = event_box.find_all(
                'div', {"class": "c-event_event-info"})[0].find_all(
                    'div', {"class": "c-event-info_value"})
            title = event_box[0].text.strip()
            date = event_box[1].text.strip()
            # Date formatting
            newdate = date.split(' ')
            date = newdate[1]
            dateTemp = date
            month = newdate[2]
            month = datetime.datetime.strptime(month, '%b').strftime('%B')
            year = newdate[3]
            date = date + ' ' + month + ' ' + year
            time = event_box[2].text.strip()
            address = event_box[3].text.strip()
            category = "EDUCATION, BUSINESS & TECHNOLOGY"

            d1 = datetime.datetime(int(year), int(month_string_to_number(month)), int(dateTemp))
            d2 = datetime.datetime.now()

            if d1 > d2:
                data = EventData()

                data.id = uuid.uuid1().__str__()
                data.title = title
                data.time = time
                data.location = address
                data.summary = desc
                data.img = image
                data.category = category
                data.startdate = date
                data.read_more = url2
                #data.address = address
                data.enddate = ''
                data.price = ''
                data_list.append(data)

        print(len(data_list))

        return data_list
    def scrape(urlOriginal, data_list):
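        """Fetch paginated event JSON from the Eventbrite API and map its categories onto the app's category set."""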

        i = 0
        for value in range(1, 67):
            url = ""
            url = urlOriginal + format(value)
            print(url)
            #JSONContent = requests.get(url).json()

            uh = uReq(url)
            data = uh.read()
            print('Retrieved', len(data), 'characters')

            JSONContent = json.loads(data.decode("utf-8"))

            content = json.dumps(JSONContent, indent=4, sort_keys=True)
            # print(content)
            data = json.loads(content)
            # location and categories
            # replacing category_id with their corresponding category name
            # scraping category id and name
            url1 = "https://www.eventbriteapi.com/v3/categories/?token=4KFS7BDPSZ5A5KWQ62KZ"
            catJSONContent = requests.get(url1).json()
            catcontent = json.dumps(catJSONContent, indent=4, sort_keys=True)
            # print(content)
            categorylist = json.loads(catcontent)

            category_list = []

            for categories in categorylist['categories']:
                name = categories['name']
                id_cat = categories['id']
                categoryy = name, id_cat
                category_list.append(categoryy)

            category_list = list(category_list)

            for events in data['events']:
                name = events['name']['text']
                description = events['description']['text']
                description = format(description)
                link = events['url']
                start_datetime = events['start']['local']

                # formating start date and time
                start_date_split = start_datetime.split('T')
                start_date = start_date_split[0]
                start_time = start_date_split[1]
                start_date = start_date.split('-')
                date = start_date[2]
                month = start_date[1]
                year = start_date[0]
                month = datetime.datetime.strptime(month, '%m').strftime('%B')
                start_date = date + ' ' + month + ' ' + year

                # end_date
                end_datetime = events['end']['local']

                # formating end date and time
                end_date_split = end_datetime.split('T')
                end_date = end_date_split[0]
                end_time = end_date_split[1]
                time = start_time + ('-') + end_time
                end_date = end_date.split('-')
                date = end_date[2]
                month = end_date[1]
                monthTemp = end_date[1]
                year = end_date[0]
                month = datetime.datetime.strptime(month, '%m').strftime('%B')
                end_date = date + (' ') + month + (' ') + year

                d1 = datetime.datetime(int(year), int(monthTemp), int(date))
                # event price
                free_event = events['is_free']
                if free_event == True:
                    price = 'free'
                else:
                    price = 'check link for more details'
                category_id = events['category_id']
                # replacing category_id with its category name (exact id match, default to 'Other')
                category = 'Other'
                for each in category_list:
                    if category_id == each[1]:
                        category = each[0]

                # Category Uniformication
                if category == 'Auto, Boat & Air' or category == 'Health & Wellness' or category == 'Sports & Fitness':
                    category = 'HEALTH & SPORTS'
                elif category == 'Business & Professional' or category == 'Science & Technology' or category == 'School Activities' or category == 'Government & Politics':
                    category = 'EDUCATION, BUSINESS & TECHNOLOGY'
                elif category == 'Charity & Causes' or category == 'Community & Culture' or category == 'Family & Education' or category == 'Home & Lifestyle' or category == 'Religion & Spirituality':
                    category = 'COMMUNITY & FESTIVALS'
                elif category == 'Fashion & Beauty' or category == 'Film, Media & Entertainment' or category == 'Performing & Visual Arts':
                    category = 'FASHION, ART & THEATRE'
                elif category == 'Food & Drink':
                    category = 'FOOD & DRINK'
                elif category == 'FREE':
                    category = 'FREE'
                elif category == 'Music' or category == 'Hobbies & Special Interest':
                    category = 'MUSIC & ENTERTAINMENT'
                elif category == 'Travel & Outdoor' or category == 'Seasonal & Holiday':
                    category = 'TOURISM & SIGHTSEEING'
                elif category == 'Other':
                    category = 'OTHERS'

                try:
                    img = events['logo']['original']['url']
                except:
                    img = 'none'
                Location = events['venue']['address'][
                    'localized_multi_line_address_display']
                # location formatting
                location = str(Location).strip('[]')
                location = location.split(',')
                try:
                    location[0] = location[0].strip("''")
                except:
                    pass
                try:
                    location[1] = location[1].strip(" ''")
                except:
                    pass
                # print(location[0])
                # print(location[1])
                try:
                    location = location[0] + (' ') + location[1]
                except:
                    location = location[0]

                if d1 > datetime.datetime.now():
                    data = EventData()

                    data.id = uuid.uuid1().__str__()
                    data.title = name
                    data.time = time
                    data.location = location
                    data.summary = description
                    data.img = img
                    data.category = category
                    data.startdate = start_date
                    data.read_more = link
                    # data.address = address
                    data.enddate = end_date
                    data.price = price
                    data_list.append(data)

            # print(len(data))

        print(len(data_list))
        return data_list
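
# The Eventbrite category mapping above (and the similar if/elif chains in the
# other scrapers) can be written as a lookup table. This is only a sketch of the
# same mapping, not the project's code; unmatched names fall through unchanged,
# as in the chain above.
EVENTBRITE_CATEGORY_MAP = {
    'Auto, Boat & Air': 'HEALTH & SPORTS',
    'Health & Wellness': 'HEALTH & SPORTS',
    'Sports & Fitness': 'HEALTH & SPORTS',
    'Business & Professional': 'EDUCATION, BUSINESS & TECHNOLOGY',
    'Science & Technology': 'EDUCATION, BUSINESS & TECHNOLOGY',
    'School Activities': 'EDUCATION, BUSINESS & TECHNOLOGY',
    'Government & Politics': 'EDUCATION, BUSINESS & TECHNOLOGY',
    'Charity & Causes': 'COMMUNITY & FESTIVALS',
    'Community & Culture': 'COMMUNITY & FESTIVALS',
    'Family & Education': 'COMMUNITY & FESTIVALS',
    'Home & Lifestyle': 'COMMUNITY & FESTIVALS',
    'Religion & Spirituality': 'COMMUNITY & FESTIVALS',
    'Fashion & Beauty': 'FASHION, ART & THEATRE',
    'Film, Media & Entertainment': 'FASHION, ART & THEATRE',
    'Performing & Visual Arts': 'FASHION, ART & THEATRE',
    'Food & Drink': 'FOOD & DRINK',
    'FREE': 'FREE',
    'Music': 'MUSIC & ENTERTAINMENT',
    'Hobbies & Special Interest': 'MUSIC & ENTERTAINMENT',
    'Travel & Outdoor': 'TOURISM & SIGHTSEEING',
    'Seasonal & Holiday': 'TOURISM & SIGHTSEEING',
    'Other': 'OTHERS',
}


def map_category(raw_name):
    """Map an Eventbrite category name to the app's category set."""
    return EVENTBRITE_CATEGORY_MAP.get(raw_name, raw_name)
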
    def scrape(url, data_list):
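        """Scrape paginated upcoming events from Knowledge Transfer Ireland into data_list."""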

        for value in range(1, 3):
            url = "https://www.knowledgetransferireland.com/Events/Upcoming-Events/?pageNumber={}".format(
                value)
            #url = url.format(value)
            print(url)
            try:
                uClient = uReq(url)
            except:
                # Skip this page if the request failed instead of reusing a stale client
                continue
            page_html = uClient.read()
            uClient.close()
            page_soup = soup(page_html, "html.parser")
            div = page_soup.findAll('div', {"class": "each-item"})

            #final_list = []

            for container in div:
                span_tags = container.div.findAll('span')
                p_tags = container.findAll('p')
                date1 = span_tags[0].text
                date2 = span_tags[1].text
                month = date2
                date2 = datetime.datetime.strptime(date2, '%b').strftime('%B')
                date3 = span_tags[2].text
                startdate = date1 + ' ' + date2 + ' ' + date3
                d1 = datetime.datetime(int(date3),
                                       int(month_string_to_number(month)),
                                       int(date1))
                category = 'EDUCATION, BUSINESS & TECHNOLOGY'
                try:
                    title = container.h2.text
                except:
                    title = 'none'
                read_more = container.h2.a['href']
                read_more = 'https://www.knowledgetransferireland.com/' + read_more
                try:
                    description = container.p.text
                except:
                    description = 'none'
                try:
                    location = p_tags[2].text
                except:
                    location = 'Dublin'
                location = location + "Dublin"

                ordinates = getOrdinates(location)

                img = 'https://uindia.net/assets/img/MediaTechnology.jpg'

                if d1 > datetime.datetime.now():
                    data = EventData()

                    data.id = uuid.uuid1().__str__()
                    data.title = title
                    data.time = ''
                    data.location = location
                    data.summary = description
                    data.img = img
                    data.category = category
                    data.startdate = startdate
                    data.read_more = read_more
                    data.address = ordinates[2]
                    data.latitude = ordinates[0]
                    data.longitude = ordinates[1]
                    data.enddate = ''
                    data.price = ''
                    data_list.append(data)

        return data_list
    def scrape(urlOriginal, data_list):
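        """Fetch paginated event JSON from a Discovery-style API (the _embedded.events format used by Ticketmaster)."""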
        #49
        #27
        for value in range(1, 27):
            url = ""
            url = urlOriginal + format(value)
            print(url)
            try:
                JSONContent = requests.get(url).json()
            except:
                # Skip this page if the request failed instead of reusing stale JSON
                continue
            content = json.dumps(JSONContent, indent=4, sort_keys=True)
            #print(content)
            data = json.loads(content)

            embedded = data['_embedded']

            for var1 in embedded['events']:
                Title = var1["name"]
                URL = var1["url"]
                date1 = var1['dates']['start']['localDate']
                date1 = date1.split('-')
                month = date1[1]
                date2 = datetime.datetime.strptime(month, '%m').strftime('%B')
                date = date1[2] + ' ' + date2 + ' ' + date1[0]
                d1 = datetime.datetime(int(date1[0]), int(date1[1]),
                                       int(date1[2]))
                d2 = datetime.datetime.now()
                try:
                    Time = var1['dates']['start']['localTime']
                except:
                    Time = ''
                try:
                    Address_Line_1 = var1['_embedded']['venues'][0]['address'][
                        'line1']
                except:
                    Address_Line_1 = ''
                try:
                    Address_Line_2 = var1['_embedded']['venues'][0]['address'][
                        'line2']
                except:
                    Address_Line_2 = ''
                try:
                    Postal_Code = var1['_embedded']['venues'][0]['postalCode']
                except:
                    Postal_Code = ''
                img = var1['images'][2]['url']
                category = var1['classifications'][0]['segment']['name']
                if category == 'Arts & Theatre' or category == '':
                    category = 'FASHION, ART & THEATRE'
                elif category == 'Sport' or category == 'Sports':
                    category = 'SPORTS & HEALTH'
                elif category == 'Family & Attractions':
                    category = 'COMMUNITY & FESTIVALS'
                elif category == 'Music':
                    category = 'MUSIC & ENTERTAINMENT'
                else:
                    category = 'OTHERS'
                #Subcategory =var1['classifications'][0]['genre']['name']
                Venue = var1['_embedded']['venues'][0]['name']
                #Location = Venue+(' ')+ Address_Line_1 +(' ')+ Address_Line_2 +(' ')+ Postal_Code
                Location = Venue + "Dublin"

                try:
                    if Location == 'Dublin':
                        # Default to the Spire when only the city is given
                        ordinates = [53.3498091, -6.2602548, "The Spire,North City,Dublin"]
                    else:
                        ordinates = getOrdinates(Location)
                except:
                    continue

                if d1 > d2:
                    data = EventData()

                    data.id = uuid.uuid1().__str__()
                    data.title = Title
                    data.time = Time
                    data.location = Location
                    data.summary = ''
                    data.img = img
                    data.category = category
                    data.startdate = date
                    data.read_more = URL
                    data.address = ordinates[2]
                    data.latitude = ordinates[0]
                    data.longitude = ordinates[1]
                    data.enddate = ''
                    data.price = ''
                    data_list.append(data)

        print(len(data_list))

        return data_list
    def scrape(url,data_list):
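        """Scrape event listings from the Poetry Ireland site into data_list."""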

        uClient = uReq(url)
        page_html = uClient.read()
        uClient.close()

        # Parse the page
        page_soup = soup(page_html, "html.parser")
        # Find each event block
        div = page_soup.findAll('article', {"class": "col-sm-1-3 col-md-1-3 col-lg-4 item item-event"})
        for container in div:

            title = container.h2.text
            a_tags = container.findAll('a')
            image = 'https://www.poetryireland.ie' + a_tags[0].img['src']
            read_more = a_tags[0]['href']
            print(read_more)
            div_tags = container.findAll('div')
            date = div_tags[2].text.strip('\n\t')

            time_tag = date.split(',')
            time = time_tag[1]
            date = time_tag[0]

            # date formatting
            newdate = date.split(' ')

            date = newdate[1]
            month = newdate[2]
            monthTemp = month
            month = datetime.datetime.strptime(month, '%b').strftime('%B')
            year = datetime.datetime.now().year
            date = date + ' ' + month + ' ' + year.__str__()

            d1 = datetime.datetime(int(year), int(month_string_to_number(monthTemp)), int(newdate[1]))

            location = div_tags[3].text
            location = location + (',') + "Dublin"

            ordinates = getOrdinates(location)

            if str(ordinates) == 'Dublin':
                ordinates = getOrdinates("Dublin")

            read_more = a_tags[1]['href']
            Category = 'FASHION, ART & THEATRE'

            if d1>datetime.datetime.now():
                data = EventData()

                data.id = uuid.uuid1().__str__()
                data.title = title
                data.time = time
                data.location = location
                data.summary = ''
                data.img = image
                data.category = Category
                data.startdate = date
                data.read_more = read_more
                data.address = ordinates[2]
                data.latitude = ordinates[0]
                data.longitude = ordinates[1] 
                data.enddate = ''
                data.price = ''
                data_list.append(data)

        print(len(data_list))

        return data_list
    def scrape(urlOriginal, data_list):
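        """Scrape a paginated 'what's on' listing, mapping its mixed category tags onto the app's category set."""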
        for value in range(1, 5):
            url = ""
            url = urlOriginal + format(value) + '/'
            print(url)
            uClient = uReq(url)
            page_html = uClient.read()
            uClient.close()

            # Parse the page
            page_soup = soup(page_html, "html.parser")
            # Find each event block
            div = page_soup.findAll('article',
                                    {"class": "article whatsonarticle"})

            for container in div:

                Title = container.h3.text.strip()
                try:
                    image = container.a.img['src']
                except:
                    image = 'None'
                category = container.h5.text.strip()
                if category == 'Activities, Fashion' or category == 'Activities, Art, Workshop' or category == 'Art, Exhibition' or category == 'Activities, Christmas, Family, Theatre' or category == 'Activities, Theatre':
                    category = 'FASHION, ART & THEATRE'
                    if category == 'FASHION, ART & THEATRE' and image == 'None':
                        image = 'https://4.bp.blogspot.com/-haQkpIywgPA/W5L1p-6P5JI/AAAAAAAANv4/279R0n1im_MugfsnYTlbf5ZiTaG2s7NYQCLcBGAs/s1600/Six_photoby_IdilSukan_18.jpg'
                elif category == 'Activities, Drinks, Family, Food And Drink,Nightlife,Talks, Workshop' or category == 'Food And Drink' or category == 'Drinks, Food And Drink, History, Tours' or category == 'Drinks, History, Tours' or category == 'Drinks, Food And Drink' or category == 'Drinks' or category == 'Activities, Drinks, Family, Food And Drink, Nightlife, Talks, Workshop' or category == 'Drinks' or category == 'Culture, Food And Drink':
                    category = 'FOOD & DRINK'
                    if category == 'FOOD & DRINK' and image == 'None':
                        image = 'https://anandipaliwal.files.wordpress.com/2015/06/food-table-relisted.jpg'
                elif category == 'Activities, Culture, Exhibition':
                    category = 'COMMUNITY & FESTIVALS'
                    if category == 'COMMUNITY & FESTIVALS' and image == 'None':
                        image = 'https://www.totallydublin.ie/wp-content/uploads/2017/07/Just-Eat-Street.jpg'
                elif category == 'Tours' or category == 'Music, Tours' or category == 'Culture, Tours' or category == 'Music, Nightlife' or category == 'Music' or category == 'Activities, Art, Culture, Drinks, Fashion, Food And Drink, Free, Market' or category == 'Activities, Comedy' or category == 'Activities, Culture, Exhibition' or category == 'Activities' or category == 'Christmas, Music' or category == 'Activities, Christmas, Cinema, Music' or category == 'Activities, Family':
                    category = 'MUSIC & ENTERTAINMENT'
                    if category == 'MUSIC & ENTERTAINMENT' and image == 'None':
                        image = 'https://livestyle.com/wp-content/uploads/2017/07/slider-4.jpg'
                elif category == 'Theatre' or category == 'Fashion' or category == 'Culture, Theatre' or category == 'Halloween, Theatre' or category == 'Activities, Fashion' or category == 'Culture, Exhibition, Family, Food And Drink' or category == 'Activities, Art, Workshop' or category == 'Family, Theatre' or category == 'Activities, Halloween, Nightlife, Tours' or category == 'Beauty, Christmas':
                    category = 'FASHION, ART & THEATRE'
                    if category == 'FASHION, ART & THEATRE' and image == 'None':
                        image = 'https://4.bp.blogspot.com/-haQkpIywgPA/W5L1p-6P5JI/AAAAAAAANv4/279R0n1im_MugfsnYTlbf5ZiTaG2s7NYQCLcBGAs/s1600/Six_photoby_IdilSukan_18.jpg'
                elif category == '':
                    category = 'OTHERS'
                    if category == 'OTHERS' and image == 'None':
                        image = 'https://discuss.fm/images/max_topic_images/others.jpg'
                elif category == 'Beauty, Fitness':
                    category = 'SPORTS & HEALTH'
                    if category == 'SPORTS & HEALTH' and image == 'None':
                        image = 'https://previews.123rf.com/images/tnn103eda/tnn103eda1705/tnn103eda170500019/79377445-huge-multi-sports-collage-soccer-basketball-football-hockey-baseball-boxing-etc.jpg'
                else:
                    category = 'OTHERS'
                    if category == 'OTHERS' and image == 'None':
                        image = 'https://discuss.fm/images/max_topic_images/others.jpg'

                URL = container.a['href']

                date = container.cite.text.strip('\n\t\t')
                split_date = date.split('-')
                start_date = split_date[0]

                # date formatting for start_Date
                format_date = start_date.split(' ')
                date = format_date[0]
                month = format_date[1]
                year = format_date[2]
                monthfull = datetime.datetime.strptime(month,
                                                       '%b').strftime('%B')
                start_date = date + ' ' + monthfull + ' ' + year
                d1 = datetime.datetime(int(year),
                                       int(month_string_to_number(monthfull)),
                                       int(date))
                start_date = start_date.strip('\t\t')

                try:
                    end_date = split_date[1]
                except:
                    end_date = 'None'

                # date formatting for end_date
                if end_date != 'None':
                    format_date = end_date.split(' ')
                    date = format_date[1]
                    month = format_date[2]
                    year = format_date[3]
                    monthfull = datetime.datetime.strptime(month,
                                                           '%b').strftime('%B')
                    end_date = date + ' ' + monthfull + ' ' + year
                    d1 = datetime.datetime(
                        int(year), int(month_string_to_number(monthfull)),
                        int(date))

                a_tags = container.div.findAll('a')

                location = a_tags[2].text
                location = location.split('|')
                location = location[0]
                if location == 'The Grafton Quarter' or location == 'The Grafton Quarter Dublin':
                    location = 'The Grafton street'
                elif location == 'Dublin One':
                    location = 'Parnell street'
                location = location + (' ') + "Dublin"

                ordinates = getOrdinates(location)
                if str(ordinates) == 'None':
                    ordinates = getOrdinates("Dublin")

                if d1 > datetime.datetime.now():
                    data = EventData()

                    data.id = uuid.uuid1().__str__()
                    data.title = Title
                    data.time = ''
                    data.location = location
                    data.summary = ''
                    data.img = image
                    data.category = category
                    data.startdate = start_date
                    data.read_more = URL
                    data.address = ordinates[2]
                    data.latitude = ordinates[0]
                    data.longitude = ordinates[1]
                    data.enddate = end_date
                    data.price = ''
                    data_list.append(data)

        print(len(data_list))

        return data_list
    def scrape(urlOriginal, data_list):
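        """Scrape paginated news/events from the Dublin Business School (DBS) site into data_list."""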
        for value in range(1, 11):
            url = ""
            url = urlOriginal + format(value) + '/'
            print(url)
            uClient = uReq(url)
            page_html = uClient.read()
            uClient.close()
            # Parse the page
            page_soup = soup(page_html, "html.parser")

            # Find each event block
            article = page_soup.findAll('li',
                                        class_='sfnewsListItem sflistitem')

            item = page_soup.findAll('li',
                                     class_='sfrelatedListItem sflistitem')
            readmore = page_soup.findAll('div', class_='NewsEvent_right')

            for container in article:
                title = container.a.text
                div_tags = container.findAll('div')
                date = div_tags[0].text.strip()

                #date formatting
                new = date.split(',')
                year = new[2]
                new1 = new[1].split(' ')
                Date = new1[2]
                month = new1[1]
                fulldate = Date + ' ' + month + ' ' + year
                summary = div_tags[1].text.strip()
                location = "Dublin Business School, Dublin"
                category = "EDUCATION, BUSINESS & TECHNOLOGY"
                img = item[0].a.img['src']

                read_more = readmore[0].a['href']
                read_more = 'https://www.dbs.ie/about-dbs/news-and-events/' + read_more

                monthInt: int = month_string_to_number(month)

                d1 = datetime(int(year), monthInt, int(Date))
                d2 = datetime.now()

                try:
                    ordinates = getOrdinates(location)
                    # if str(ordinates) == 'Dublin':
                    #     ordinates = getOrdinates("Dublin")
                except:
                    continue

                if d1 > d2:
                    data = EventData()

                    data.id = uuid.uuid1().__str__()
                    data.title = title
                    data.startdate = fulldate
                    data.enddate = ''
                    data.time = ''
                    data.category = category
                    data.price = ''
                    data.summary = summary
                    data.address = ordinates[2]
                    data.location = location
                    data.img = img
                    data.latitude = ordinates[0]
                    data.longitude = ordinates[1]
                    data.read_more = read_more
                    data_list.append(data)

        print(len(data_list))
        return data_list
    def scrape(url, data_list):
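        """Scrape today's events from a calendar block, reading time/location/price/category from the label/value pairs."""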

        uClient = uReq(url)
        page_html = uClient.read()
        uClient.close()

        # Parse the page
        page_soup = soup(page_html, "html.parser")

        # Find today's events block and its event items
        events_today = page_soup.find_all('div',
                                          class_="calendarEventsBlockWrap")
        events = events_today[0].find_all('div', class_="event-item clearfix")
        #print(events_today)

        for event in events:
            title = event.h3.a["title"]
            #print(title)
            read_more = event.find_all(
                'a', {"class": "calendarImageLink"})[0]["data-link-url"]
            try:
                image = event.find_all(
                    'a', {"class": "calendarImageLink"})[0]['href']
            #         print(image)
            except:
                image = 'None'
            if image.endswith(".jpg") | image.endswith(".png"):
                image = image
            else:
                image = 'None'

            date = event.em.text

            start_date = date.split('through')[0].strip()
            try:
                Year1 = start_date.split(',')[1].strip()
            except:
                Year1 = 'None'
            Date1 = start_date.split(',')[0].split(' ')[1]
            Month1 = start_date.split(',')[0].split(' ')[2]
            start_date = Date1 + " " + Month1 + " " + Year1

            end_date = date.split('through')[1].strip()
            Year2 = end_date.split(',')[1].strip()
            Date2 = end_date.split(',')[0].split(' ')[1]
            Month2 = end_date.split(',')[0].split(' ')[2]
            end_date = Date2 + " " + Month2 + " " + Year2
            print(end_date)
            d1 = datetime(int(Year2.split('.')[0]),
                          int(month_string_to_number(Month2)), int(Date2))
            d2 = datetime.now()
            # Description
            desc = event.div.text.strip()

            # Time, location, price, address

            header = [elem.next for elem in event.find_all('strong')]
            header = header[1:]
            #     print(header)
            values = [elem.next.next for elem in event.find_all('strong')]
            values = values[1:]
            #     print(values)
            j = 0
            my_dict = {}
            for i in header:
                my_dict[i] = values[j].strip()
                j = j + 1
            category = my_dict["Category:"]
            if category == 'Arts / Exhibits' or category == 'Comedy' or category == 'Theatre / Dance':
                category = 'FASHION, ART & THEATRE'
                if category == 'FASHION, ART & THEATRE' and image == 'None':
                    image = 'https://4.bp.blogspot.com/-haQkpIywgPA/W5L1p-6P5JI/AAAAAAAANv4/279R0n1im_MugfsnYTlbf5ZiTaG2s7NYQCLcBGAs/s1600/Six_photoby_IdilSukan_18.jpg'
            elif category == 'Business Networking' or category == 'Canvention / Conference' or category == 'Educational' or category == 'Expo':
                category = 'EDUCATION, BUSINESS & TECHNOLOGY'
                if category == 'EDUCATION, BUSINESS & TECHNOLOGY' and image == 'None':
                    image = 'https://uindia.net/assets/img/MediaTechnology.jpg'
            elif category == 'Concert / Live Music' or category == 'Cultutal' or category == 'Entertainment':
                category = 'MUSIC & ENTERTAINMENT'
                if category == 'MUSIC & ENTERTAINMENT' and image == 'None':
                    image = 'https://livestyle.com/wp-content/uploads/2017/07/slider-4.jpg'
            elif category == 'Festival' or category == 'Kids / Family':
                category = 'COMMUNITY & FESTIVALS'
                if category == 'COMMUNITY & FESTIVALS' and image == 'None':
                    image = 'https://www.totallydublin.ie/wp-content/uploads/2017/07/Just-Eat-Street.jpg'
            elif category == ' Health' or category == 'Sports':
                category = 'SPORTS & HEALTH'
                if category == 'SPORTS & HEALTH' and image == 'None':
                    image = 'https://previews.123rf.com/images/tnn103eda/tnn103eda1705/tnn103eda170500019/79377445-huge-multi-sports-collage-soccer-basketball-football-hockey-baseball-boxing-etc.jpg'

            if d1 > d2:
                data = EventData()

                data.id = uuid.uuid1().__str__()
                data.title = title
                data.img = image
                data.startdate = start_date
                data.enddate = end_date
                data.summary = desc
                data.time = my_dict["Time:"]
                data.location = my_dict["Location:"] + my_dict["Address:"]
                #data.address = my_dict["Address:"]
                data.read_more = read_more
                try:
                    data.price = my_dict["Price:"]
                except:
                    data.price = "SEE DESCRIPTION"
                data.category = category
                data_list.append(data)

        print(len(data_list))
        return data_list