Ejemplo n.º 1
0
    def scrape(url, data_list):
        """Append upcoming entries from the news listing at ``url``.

        The page is downloaded and every ``div`` carrying the classes
        ``wrapBlock wrapBlockNews`` is read. One ``EventData`` is appended to
        ``data_list`` for each entry whose date lies after the current time.

        Args:
            url: Address of the listing page to download.
            data_list: List that receives the ``EventData`` objects.

        Returns:
            ``data_list`` itself, after any new entries were appended.
        """
        connection = uReq(url)
        raw_html = connection.read()
        connection.close()

        document = soup(raw_html, "html.parser")
        entries = document.findAll('div', {"class": "wrapBlock wrapBlockNews"})

        for entry in entries:
            headline = entry.h2.text

            # The <em> text is stripped of parentheses and split on "/"; the
            # pieces are used below as day, month number and year.
            pieces = entry.em.text.strip('()').split('/')
            month_name = datetime.datetime.strptime(
                pieces[1], '%m').strftime('%B')
            # Year, month name and day are joined without any separator.
            start_text = ''.join((pieces[2], month_name, pieces[0]))

            event_day = datetime.datetime(
                int(pieces[2]),
                int(month_string_to_number(month_name)),
                int(pieces[0]),
            )
            now = datetime.datetime.now()

            # Every entry is attributed to the same fixed venue.
            venue = 'Whitehall College of Further Education, Drumcondra, Dublin'
            geo = getOrdinates(venue)
            link = entry.a['href']

            # Only entries dated strictly after "now" are recorded.
            if not event_day > now:
                continue

            record = EventData()
            record.id = str(uuid.uuid1())
            record.title = headline
            record.startdate = start_text
            record.enddate = ''
            record.time = ''
            record.category = 'EDUCATION, BUSINESS & TECHNOLOGY'
            record.price = ''
            record.summary = ''
            record.location = venue
            # geo is indexed as (latitude, longitude, address).
            record.address = geo[2]
            record.latitude = geo[0]
            record.longitude = geo[1]
            record.img = 'https://whitehallcollege.com/uploads/pages/logo.jpg'
            record.read_more = link
            data_list.append(record)

        print(len(data_list))
        return data_list
Ejemplo n.º 2
0
    def scrape(urlOriginal, data_list):
        """Scrape upcoming events from two paginated listing pages.

        Pages ``urlOriginal + "1"`` and ``urlOriginal + "2"`` are downloaded.
        For every ``search-event-card-wrapper`` card the page behind the
        card's first link is downloaded as well, to read the description,
        date and venue. Events whose parsed start date is later than the
        current time are appended to ``data_list`` as ``EventData`` objects.

        Pages or event pages that cannot be opened are skipped. Events whose
        date cannot be resolved to a single day (for example the
        "Multiple Dates" variants) are skipped too, instead of being compared
        using values left over from a previously processed card.

        Args:
            urlOriginal: Listing URL prefix; the page number is appended.
            data_list: List that receives the ``EventData`` objects.

        Returns:
            ``data_list`` itself, after any new events were appended.
        """
        multiple_dates = ('Multiple Dates', 'Multiple Dates GMT',
                          'Multiple Dates IST')

        for value in range(1, 3):
            url = urlOriginal + format(value)
            print(url)
            try:
                uClient = uReq(url)
            except Exception:
                # Listing page unavailable: move on to the next page rather
                # than reading from an undefined or already-closed response.
                continue
            try:
                page_html = uClient.read()
            finally:
                uClient.close()

            page_soup = soup(page_html, "html.parser")
            article_1 = page_soup.findAll('div',
                                          class_='search-event-card-wrapper')

            for container in article_1:
                title = container.findAll(
                    'div', class_='eds-event-card__formatted-name--is-clamped'
                )[0].text

                try:
                    Price = container.findAll(
                        'div',
                        class_=
                        'eds-media-card-content__sub eds-text-bm eds-text-color--grey-600 eds-l-mar-top-1 eds-media-card-content__sub--cropped'
                    )[1].text
                except IndexError:
                    Price = 'None'

                a_tags = container.findAll('a')
                try:
                    image = a_tags[0].img['src']
                except (IndexError, TypeError, KeyError):
                    image = 'None'
                read_more = a_tags[0]['href']
                print(read_more)

                category = 'COMMUNITY & FESTIVALS'
                if image == 'None':
                    # Placeholder picture for cards without an image.
                    image = 'https://www.totallydublin.ie/wp-content/uploads/2017/07/Just-Eat-Street.jpg'

                # Open the event's own page for description, date and venue.
                try:
                    uClient = uReq(read_more)
                except Exception:
                    # Event page unavailable: skip this event.
                    continue
                try:
                    desc_html = uClient.read()
                finally:
                    uClient.close()
                desc_soup = soup(desc_html, "html.parser")

                desc = desc_soup.findAll(
                    'div', class_='js-xd-read-more-contents l-mar-top-3'
                ) or desc_soup.findAll(
                    'div',
                    class_=
                    'structured-content-rich-text structured-content__module l-align-left l-mar-vert-6 l-sm-mar-vert-4 text-body-medium'
                )
                if desc:
                    description = ''.join(
                        p.text for p in desc[0].findAll('p'))
                else:
                    description = 'None'

                # Date fetching and formatting. The date parts are reset for
                # every card so that nothing leaks in from the previous one.
                Final_Date = 'None'
                Date = Month = year = None
                time = desc_soup.findAll('time', class_='clrfix')
                if len(time) > 0:
                    time_tags = time[0].findAll('p')
                    date_check = time_tags[0].text

                    if date_check in multiple_dates:
                        Final_Date = date_check
                    else:
                        Date_time = date_check.split(',')

                        if len(Date_time) == 2:
                            Final_Date = Date_time[1].strip(' ')
                        elif len(Date_time) >= 3:
                            Mon_Date = Date_time[1].split(' ')
                            if len(Mon_Date) == 3:
                                Date = Mon_Date[2]
                                month = Mon_Date[1]
                                if len(month) <= 3:
                                    # Abbreviated month: expand to full name.
                                    Month = datetime.datetime.strptime(
                                        month, '%b').strftime('%B')
                                else:
                                    Month = month
                                year = Date_time[2]
                                Final_Date = Date + (' ') + Month + year
                            elif len(Mon_Date) == 4:
                                Date = Mon_Date[1]
                                month = Mon_Date[2]
                                Month = datetime.datetime.strptime(
                                    month, '%b').strftime('%B')
                                year = Mon_Date[3]
                                Final_Date = Date + (' ') + Month + (
                                    ' ') + year

                # Location fetching: the venue is read from the first <p> of
                # the second "event-details__data" block when it exists.
                location = 'Dublin'
                location_div = desc_soup.findAll('div',
                                                 class_='event-details__data')
                if len(location_div) > 1:
                    location_tags = location_div[1].findAll('p')
                    if location_tags:
                        location = location_tags[0].text + (' ') + "Dublin"

                print(location)

                if location == 'Dublin':
                    # No venue text: use fixed coordinates, laid out like the
                    # values read from getOrdinates (lat, lon, address).
                    ordinates = [53.3498091, -6.2602548,
                                 "The Spire,North City,Dublin"]
                else:
                    try:
                        ordinates = getOrdinates(location)
                    except Exception:
                        # Geocoding failed: skip this event.
                        continue

                if Date is None or Month is None or year is None:
                    # No single start day could be parsed for this event.
                    continue
                try:
                    d1 = datetime.datetime(int(year),
                                           int(month_string_to_number(Month)),
                                           int(Date))
                except Exception:
                    continue

                d2 = datetime.datetime.now()

                if d1 > d2:
                    data = EventData()

                    data.id = uuid.uuid1().__str__()
                    data.title = title
                    data.time = ''
                    data.location = location
                    data.summary = description
                    data.img = image
                    data.category = category
                    data.address = ordinates[2]
                    data.startdate = Final_Date
                    data.read_more = read_more
                    data.enddate = ''
                    data.price = Price
                    data.latitude = ordinates[0]
                    data.longitude = ordinates[1]
                    data_list.append(data)

        print(len(data_list))
        return data_list
Ejemplo n.º 3
0
    def scrape(url,data_list):
        """Append upcoming Sugar Club events found at ``url`` to ``data_list``.

        The page is downloaded and each ``div`` with class ``grid-item`` is
        read. An ``EventData`` is appended for every item whose date lies
        after the current time.

        Args:
            url: Address of the listing page to download.
            data_list: List that receives the ``EventData`` objects.

        Returns:
            ``data_list`` itself, after any new events were appended.
        """
        response = uReq(url)
        markup = response.read()
        response.close()

        tree = soup(markup, "html.parser")

        for item in tree.findAll('div',{"class":"grid-item"}):
            # Link and picture are optional; fall back to placeholders.
            try:
                link = item.a['href']
            except:
                link = ''
            try:
                picture = item.a.img['src']
            except:
                picture = 'None'

            name = item.h3.text

            # The <h2> text is split on "/" into a date part and a time part;
            # the date part is split on "." into day, month number and year,
            # and the day is the second space-separated token of the first
            # piece.
            when_parts = item.h2.text.split('/')
            dotted = when_parts[0].split('.')
            day = dotted[0].split(' ')[1]
            month_number = dotted[1]
            month_name = datetime.datetime.strptime(
                month_number,'%m').strftime('%B')
            year = dotted[2]
            start_text = ' '.join((day, month_name, year))

            starts = datetime.datetime(int(year), int(month_number), int(day))
            now = datetime.datetime.now()

            inner_divs = item.findAll('div')
            show_time = when_parts[1]
            cost = inner_divs[1].p.text

            # Every item is attributed to the same fixed venue.
            venue = "Sugar Club, Leeson Street, Dublin"
            geo = getOrdinates(venue)

            genre = "MUSIC & ENTERTAINMENT"
            if picture == 'None':
                picture = 'https://livestyle.com/wp-content/uploads/2017/07/slider-4.jpg'

            # Only items dated strictly after "now" are recorded.
            if not starts > now:
                continue

            record = EventData()
            record.id = str(uuid.uuid1())
            record.title = name
            record.time = show_time
            record.location = venue
            record.summary = ''
            record.img = picture
            record.category = genre
            record.startdate = start_text
            record.read_more = link
            # geo is indexed as (latitude, longitude, address).
            record.address = geo[2]
            record.latitude = geo[0]
            record.longitude = geo[1]
            record.enddate = ''
            record.price = cost
            data_list.append(record)

        print(len(data_list))

        return data_list
Ejemplo n.º 4
0
    def scrape(urlOriginal, data_list):
        """Scrape attraction entries from pages 1 to 135 of a listing.

        For each page ``urlOriginal + "<n>/"`` every ``div`` with class
        ``details`` becomes one ``EventData`` appended to ``data_list``.
        No date filtering is applied; ``startdate`` is left empty.

        The entries are appended to the list passed by the caller (it is no
        longer replaced by a fresh empty list), which matches the other
        scrapers. Pages that cannot be opened are skipped.

        Args:
            urlOriginal: Listing URL prefix; the page number and a trailing
                slash are appended.
            data_list: List that receives the ``EventData`` objects.

        Returns:
            ``data_list`` itself, after the new entries were appended.
        """
        category = 'TOURISM & SIGHTSEEING'

        for value in range(1, 136):
            url = urlOriginal + format(value) + '/'
            print(url)
            try:
                uClient = uReq(url)
            except Exception:
                # Page unavailable: skip it instead of reading from an
                # undefined or already-closed response.
                continue
            try:
                page_html = uClient.read()
            finally:
                uClient.close()

            page_soup = soup(page_html, "html.parser")
            for container in page_soup.findAll('div', class_='details'):
                try:
                    title = container.h2.text
                except AttributeError:
                    title = 'None'

                location = container.h3.text

                try:
                    description = container.p.text.strip('\n').strip(' ')
                except AttributeError:
                    description = 'None'

                try:
                    img = container.img['src']
                except (TypeError, KeyError):
                    # Placeholder picture for entries without an image.
                    img = 'https://www.fhi.no/globalassets/bilder/vaksine/oversikt-reisevaksine.jpg?preset=mainbodywidth'

                read = 'https://www.ireland.com/' + container.a['href']

                print(location)

                if location == 'Dublin':
                    # Fixed coordinates, laid out like the values read from
                    # getOrdinates (lat, lon, address).
                    ordinates = [53.3498091, -6.2602548,
                                 "The Spire,North City,Dublin"]
                else:
                    ordinates = getOrdinates(location)

                data = EventData()

                data.id = uuid.uuid1().__str__()
                data.title = title
                data.time = ''
                data.location = location
                data.summary = description
                data.img = img
                data.category = category
                data.startdate = ''
                data.read_more = read
                data.address = ordinates[2]
                data.latitude = ordinates[0]
                data.longitude = ordinates[1]
                data.enddate = ''
                data.price = ''
                data_list.append(data)

        print(len(data_list))

        return data_list
Ejemplo n.º 5
0
    def scrape(urlOriginal, data_list):
        """Scrape upcoming RCSI events from listing pages 1 to 3.

        Each ``li`` with class ``wrapped wrapped-borderless`` is read. The
        listing shows no year, so the current year is assumed. When the day
        is a range ("12-14") the end day is used for the "still upcoming"
        check, and ``enddate`` is filled in; otherwise ``enddate`` is the
        string ``'None'``.

        Args:
            urlOriginal: Listing URL prefix; the page number is appended.
            data_list: List that receives the ``EventData`` objects.

        Returns:
            ``data_list`` itself, after any new events were appended.
        """
        for value in range(1, 4):
            url = urlOriginal + format(value)
            print(url)

            uClient = uReq(url)
            try:
                page_html = uClient.read()
            finally:
                uClient.close()

            page_soup = soup(page_html, "html.parser")
            div = page_soup.find_all('li',
                                     {"class": "wrapped wrapped-borderless"})

            for container in div:
                Title = container.h3.text.strip()
                URL = 'https://www.rcsi.com' + container.a['href']

                # Date formatting: second token is the day (or "start-end"),
                # third token is the abbreviated month.
                date_split = container.div.text.replace("\n", " ").split(' ')
                day = date_split[1]
                # Reset for every item so that an earlier ranged event does
                # not leak its end day into a later single-day event.
                end_day = None
                if "-" in day:
                    day, end_day = day.split('-')[:2]

                parsed_month = datetime.datetime.strptime(date_split[2], '%b')
                month = parsed_month.strftime('%B')
                year = datetime.datetime.now().year
                date1 = day + ' ' + month + ' ' + str(year)
                d1 = datetime.datetime(year, parsed_month.month, int(day))

                enddate = 'None'
                if end_day is not None:
                    try:
                        d1 = datetime.datetime(year, parsed_month.month,
                                               int(end_day))
                    except ValueError:
                        # Unusable end day: keep comparing on the start day.
                        enddate = 'None'
                    else:
                        enddate = end_day + ' ' + month + ' ' + str(year)

                d2 = datetime.datetime.now()

                Location = container.p.text.strip('\n')

                # Geocode once; an item whose lookup raises is skipped.
                try:
                    ordinates = getOrdinates(Location)
                except Exception:
                    continue
                # Fall back to the city when the lookup yields nothing (same
                # 'None' check the sibling scraper uses).
                if str(ordinates) == 'None':
                    ordinates = getOrdinates("Dublin")

                Category = 'EDUCATION, BUSINESS & TECHNOLOGY'
                img = 'http://www.hrbcentreprimarycare.ie/images/rcsilogonewer.png'

                if d1 > d2:
                    data = EventData()

                    data.id = uuid.uuid1().__str__()
                    data.title = Title
                    data.time = ''
                    data.location = Location
                    data.summary = ''
                    data.img = img
                    data.category = Category
                    data.startdate = date1
                    data.read_more = URL
                    data.address = ordinates[2]
                    data.latitude = ordinates[0]
                    data.longitude = ordinates[1]
                    data.enddate = enddate
                    data.price = ''
                    data_list.append(data)

        print(len(data_list))

        return data_list
Ejemplo n.º 6
0
    def scrape(url, data_list):
        """Scrape upcoming events from Knowledge Transfer Ireland.

        Pages 1 and 2 of the hard-coded "Upcoming-Events" listing are
        downloaded and each ``div`` with class ``each-item`` is read. Events
        dated after the current time are appended to ``data_list``.

        Args:
            url: Not used for fetching; the listing address is hard-coded
                and this name is overwritten with it.
            data_list: List that receives the ``EventData`` objects.

        Returns:
            ``data_list`` itself, after any new events were appended.
        """
        category = 'EDUCATION, BUSINESS & TECHNOLOGY'
        img = 'https://uindia.net/assets/img/MediaTechnology.jpg'

        for value in range(1, 3):
            url = "https://www.knowledgetransferireland.com/Events/Upcoming-Events/?pageNumber={}".format(
                value)
            print(url)
            try:
                uClient = uReq(url)
            except Exception:
                # Page unavailable: skip it instead of reading from an
                # undefined or already-closed response.
                continue
            try:
                page_html = uClient.read()
            finally:
                uClient.close()
            page_soup = soup(page_html, "html.parser")
            div = page_soup.findAll('div', {"class": "each-item"})

            for container in div:
                # The first three <span>s hold day, abbreviated month, year.
                span_tags = container.div.findAll('span')
                p_tags = container.findAll('p')
                day = span_tags[0].text
                parsed_month = datetime.datetime.strptime(
                    span_tags[1].text, '%b')
                year = span_tags[2].text
                startdate = (day + ' ' + parsed_month.strftime('%B') + ' ' +
                             year)
                d1 = datetime.datetime(int(year), parsed_month.month,
                                       int(day))

                try:
                    title = container.h2.text
                except AttributeError:
                    title = 'none'
                read_more = container.h2.a['href']
                read_more = 'https://www.knowledgetransferireland.com/' + read_more
                try:
                    description = container.p.text
                except AttributeError:
                    description = 'none'

                # Venue is the third <p>; without it only the city is used.
                # A space separates venue and city so the geocoder does not
                # receive fused text such as "DublinDublin".
                try:
                    venue = p_tags[2].text
                except IndexError:
                    venue = ''
                location = venue + ' Dublin' if venue else 'Dublin'

                ordinates = getOrdinates(location)

                if d1 > datetime.datetime.now():
                    data = EventData()

                    data.id = uuid.uuid1().__str__()
                    data.title = title
                    data.time = ''
                    data.location = location
                    data.summary = description
                    data.img = img
                    data.category = category
                    data.startdate = startdate
                    data.read_more = read_more
                    data.address = ordinates[2]
                    data.latitude = ordinates[0]
                    data.longitude = ordinates[1]
                    data.enddate = ''
                    data.price = ''
                    data_list.append(data)

        return data_list
Ejemplo n.º 7
0
    def scrape(urlOriginal, data_list):
        """Scrape upcoming events from pages 1 to 26 of a JSON events API.

        Each page ``urlOriginal + "<n>"`` is fetched with ``requests`` and
        decoded as JSON; the events are read from ``_embedded.events``.
        Events whose ``localDate`` is after the current time are appended to
        ``data_list``. Pages that cannot be fetched or decoded, and pages
        without an ``_embedded`` section, are skipped.

        Args:
            urlOriginal: API URL prefix; the page number is appended.
            data_list: List that receives the ``EventData`` objects.

        Returns:
            ``data_list`` itself, after any new events were appended.
        """
        # Maps the API's segment name to this project's category label;
        # anything not listed becomes 'OTHERS'.
        category_map = {
            'Arts & Theatre': 'FASHION, ART & THEATRE',
            '': 'FASHION, ART & THEATRE',
            'Sport': 'SPORTS & HEALTH',
            'Sports': 'SPORTS & HEALTH',
            'Family & Attractions': 'COMMUNITY & FESTIVALS',
            'Music': 'MUSIC & ENTERTAINMENT',
        }

        for value in range(1, 27):
            url = urlOriginal + format(value)
            print(url)
            try:
                payload = requests.get(url).json()
            except Exception:
                # Fetch or decode failed: skip this page rather than
                # re-processing the previous page's payload.
                continue

            events = payload.get('_embedded', {}).get('events', [])

            for event in events:
                Title = event["name"]
                URL = event["url"]

                # localDate is split on "-" into year, month and day.
                date_parts = event['dates']['start']['localDate'].split('-')
                month_name = datetime.datetime.strptime(
                    date_parts[1], '%m').strftime('%B')
                date = date_parts[2] + ' ' + month_name + ' ' + date_parts[0]
                d1 = datetime.datetime(int(date_parts[0]),
                                       int(date_parts[1]),
                                       int(date_parts[2]))
                d2 = datetime.datetime.now()

                try:
                    Time = event['dates']['start']['localTime']
                except KeyError:
                    Time = ''

                img = event['images'][2]['url']
                segment = event['classifications'][0]['segment']['name']
                category = category_map.get(segment, 'OTHERS')

                Venue = event['_embedded']['venues'][0]['name']
                # A space separates venue and city so the geocoder does not
                # receive fused text such as "3ArenaDublin".
                Location = Venue + ' Dublin' if Venue else 'Dublin'

                if Location == 'Dublin':
                    # No venue name: fixed coordinates, laid out like the
                    # values read from getOrdinates (lat, lon, address).
                    ordinates = [53.3498091, -6.2602548,
                                 "The Spire,North City,Dublin"]
                else:
                    try:
                        ordinates = getOrdinates(Location)
                    except Exception:
                        # Geocoding failed: skip this event.
                        continue

                if d1 > d2:
                    data = EventData()

                    data.id = uuid.uuid1().__str__()
                    data.title = Title
                    data.time = Time
                    data.location = Location
                    data.summary = ''
                    data.img = img
                    data.category = category
                    data.startdate = date
                    data.read_more = URL
                    data.address = ordinates[2]
                    data.latitude = ordinates[0]
                    data.longitude = ordinates[1]
                    data.enddate = ''
                    data.price = ''
                    data_list.append(data)

        print(len(data_list))

        return data_list
    def scrape(url,data_list):
        """Append upcoming Poetry Ireland events found at ``url``.

        The page is downloaded and each ``article`` with the classes
        ``col-sm-1-3 col-md-1-3 col-lg-4 item item-event`` is read. The
        listing shows no year, so the current year is assumed. Events dated
        after the current time are appended to ``data_list``.

        Args:
            url: Address of the listing page to download.
            data_list: List that receives the ``EventData`` objects.

        Returns:
            ``data_list`` itself, after any new events were appended.
        """
        uClient = uReq(url)
        try:
            page_html = uClient.read()
        finally:
            uClient.close()

        page_soup = soup(page_html, "html.parser")
        div = page_soup.findAll('article',{"class":"col-sm-1-3 col-md-1-3 col-lg-4 item item-event"})
        for container in div:

            title = container.h2.text
            a_tags = container.findAll('a')
            image = 'https://www.poetryireland.ie' + a_tags[0].img['src']
            # The first link is only printed; the second one is stored.
            print(a_tags[0]['href'])
            div_tags = container.findAll('div')

            # Third <div>: "<date>,<time>" once surrounding newlines and
            # tabs are stripped.
            date_and_time = div_tags[2].text.strip('\n\t').split(',')
            time = date_and_time[1]

            # Date formatting: second token is the day, third the
            # abbreviated month.
            newdate = date_and_time[0].split(' ')
            parsed_month = datetime.datetime.strptime(newdate[2], '%b')
            year = datetime.datetime.now().year
            date = (newdate[1] + ' ' + parsed_month.strftime('%B') + ' ' +
                    str(year))

            d1 = datetime.datetime(year, parsed_month.month, int(newdate[1]))

            location = div_tags[3].text
            location = location + (',') + "Dublin"

            ordinates = getOrdinates(location)
            # Fall back to the city when the lookup yields nothing (same
            # 'None' check the sibling scraper uses).
            if str(ordinates) == 'None':
                ordinates = getOrdinates("Dublin")

            read_more = a_tags[1]['href']
            Category = 'FASHION, ART & THEATRE'

            if d1 > datetime.datetime.now():
                data = EventData()

                data.id = uuid.uuid1().__str__()
                data.title = title
                data.time = time
                data.location = location
                data.summary = ''
                data.img = image
                data.category = Category
                data.startdate = date
                data.read_more = read_more
                data.address = ordinates[2]
                data.latitude = ordinates[0]
                data.longitude = ordinates[1]
                data.enddate = ''
                data.price = ''
                data_list.append(data)

        print(len(data_list))

        return data_list
Ejemplo n.º 9
0
    def scrape(urlOriginal, data_list):
        for value in range(1, 5):
            url = ""
            url = urlOriginal + format(value) + '/'
            print(url)
            uClient = uReq(url)
            page_html = uClient.read()
            uClient.close()

            # Finding each events
            page_soup = soup(page_html, "html.parser")
            # Finding each events
            div = page_soup.findAll('article',
                                    {"class": "article whatsonarticle"})

            for container in div:

                Title = container.h3.text.strip()
                try:
                    image = container.a.img['src']
                except:
                    image = 'None'
                category = container.h5.text.strip()
                if category == 'Activities, Fashion' or category == 'Activities, Art, Workshop' or category == 'Art, Exhibition' or category == 'Activities, Christmas, Family, Theatre' or category == 'Activities, Theatre':
                    category = 'FASHION, ART & THEATRE'
                    if category == 'FASHION, ART & THEATRE' and image == 'None':
                        image = 'https://4.bp.blogspot.com/-haQkpIywgPA/W5L1p-6P5JI/AAAAAAAANv4/279R0n1im_MugfsnYTlbf5ZiTaG2s7NYQCLcBGAs/s1600/Six_photoby_IdilSukan_18.jpg'
                elif category == 'Activities, Drinks, Family, Food And Drink,Nightlife,Talks, Workshop' or category == 'Food And Drink' or category == 'Drinks, Food And Drink, History, Tours' or category == 'Drinks, History, Tours' or category == 'Drinks, Food And Drink' or category == 'Drinks' or category == 'Activities, Drinks, Family, Food And Drink, Nightlife, Talks, Workshop' or category == 'Drinks' or category == 'Culture, Food And Drink':
                    category = 'FOOD & DRINK'
                    if category == 'FOOD & DRINK' and image == 'None':
                        image = 'https://anandipaliwal.files.wordpress.com/2015/06/food-table-relisted.jpg'
                elif category == 'Activities, Culture, Exhibition':
                    category = 'COMMUNITY & FESTIVALS'
                    if category == 'COMMUNITY & FESTIVALS' and image == 'None':
                        image = 'https://www.totallydublin.ie/wp-content/uploads/2017/07/Just-Eat-Street.jpg'
                elif category == 'Tours' or category == 'Music, Tours' or category == 'Culture, Tours' or category == 'Music, Nightlife' or category == 'Music' or category == 'Activities, Art, Culture, Drinks, Fashion, Food And Drink, Free, Market' or category == 'Activities, Comedy' or category == 'Activities, Culture, Exhibition' or category == 'Activities' or category == 'Christmas, Music' or category == 'Activities, Christmas, Cinema, Music' or category == 'Activities, Family':
                    category = 'MUSIC & ENTERTAINMENT'
                    if category == 'MUSIC & ENTERTAINMENT' and image == 'None':
                        image = 'https://livestyle.com/wp-content/uploads/2017/07/slider-4.jpg'
                elif category == 'Theatre' or category == 'Fashion' or category == 'Culture, Theatre' or category == 'Halloween, Theatre' or category == 'Activities, Fashion' or category == 'Culture, Exhibition, Family, Food And Drink' or category == 'Activities, Art, Workshop' or category == 'Family, Theatre' or category == 'Activities, Halloween, Nightlife, Tours' or category == 'Beauty, Christmas':
                    category = 'FASHION, ART & THEATRE'
                    if category == 'FASHION, ART & THEATRE' and image == 'None':
                        image = 'https://4.bp.blogspot.com/-haQkpIywgPA/W5L1p-6P5JI/AAAAAAAANv4/279R0n1im_MugfsnYTlbf5ZiTaG2s7NYQCLcBGAs/s1600/Six_photoby_IdilSukan_18.jpg'
                elif category == '':
                    category = 'OTHERS'
                    if category == 'OTHERS' and image == 'None':
                        image = 'https://discuss.fm/images/max_topic_images/others.jpg'
                elif category == 'Beauty, Fitness':
                    category = 'SPORTS & HEALTH'
                    if category == 'SPORTS & HEALTH' and image == 'None':
                        image = 'https://previews.123rf.com/images/tnn103eda/tnn103eda1705/tnn103eda170500019/79377445-huge-multi-sports-collage-soccer-basketball-football-hockey-baseball-boxing-etc.jpg'
                else:
                    category = 'OTHERS'
                    if category == 'OTHERS' and image == 'None':
                        image = 'https://discuss.fm/images/max_topic_images/others.jpg'

                URL = container.a['href']

                date = container.cite.text.strip('\n\t\t')
                split_date = date.split('-')
                start_date = split_date[0]

                # date formatting for start_Date
                format_date = start_date.split(' ')
                date = format_date[0]
                month = format_date[1]
                year = format_date[2]
                monthfull = datetime.datetime.strptime(month,
                                                       '%b').strftime('%B')
                start_date = date + ' ' + monthfull + ' ' + year
                d1 = datetime.datetime(int(year),
                                       int(month_string_to_number(monthfull)),
                                       int(date))
                start_date = start_date.strip('\t\t')

                try:
                    end_date = split_date[1]
                except:
                    end_date = 'None'
        # date formatting for end_date
                if end_date is not 'None':
                    format_date = end_date.split(' ')
                    date = format_date[1]
                    month = format_date[2]
                    year = format_date[3]
                    monthfull = datetime.datetime.strptime(month,
                                                           '%b').strftime('%B')
                    end_date = date + ' ' + monthfull + ' ' + year
                    d1 = datetime.datetime(
                        int(year), int(month_string_to_number(monthfull)),
                        int(date))

        # date formatting for end_date

                a_tags = container.div.findAll('a')

                location = a_tags[2].text
                location = location.split('|')
                location = location[0]
                if location == 'The Grafton Quarter' or location == 'The Grafton Quarter Dublin':
                    location = 'The Grafton street'
                elif location == 'Dublin One':
                    location = 'Parnell street'
                else:
                    location = location
                location = location + (' ') + "Dublin"

                ordinates = getOrdinates(location)
                if str(ordinates) == 'None':
                    ordinates = getOrdinates("Dublin")

                if d1 > datetime.datetime.now():
                    data = EventData()

                    data.id = uuid.uuid1().__str__()
                    data.title = Title
                    data.time = ''
                    data.location = location
                    data.summary = ''
                    data.img = image
                    data.category = category
                    data.startdate = start_date
                    data.read_more = URL
                    data.address = ordinates[2]
                    data.latitude = ordinates[0]
                    data.longitude = ordinates[1]
                    data.enddate = end_date
                    data.price = ''
                    data_list.append(data)

        print(len(data_list))

        return data_list
Ejemplo n.º 10
0
    def scrape(urlOriginal, data_list):
        """Collect upcoming news/event items from pages 1-10 of a listing.

        For each page number 1..10 the URL ``urlOriginal + <n> + '/'`` is
        fetched and parsed. Every ``li.sfnewsListItem.sflistitem`` element
        whose date lies in the future is converted to an ``EventData`` and
        appended to ``data_list``.

        Args:
            urlOriginal: Base listing URL; the page number and a trailing
                slash are appended to it.
            data_list: List that receives the ``EventData`` objects
                (mutated in place).

        Returns:
            The same ``data_list`` object that was passed in.
        """
        location = "Dublin Business School, Dublin"
        category = "EDUCATION, BUSINESS & TECHNOLOGY"
        link_prefix = 'https://www.dbs.ie/about-dbs/news-and-events/'

        # The location is the same for every event, so the geocoding result
        # is looked up lazily on the first upcoming event and then reused.
        ordinates = None

        for page_number in range(1, 11):
            url = f"{urlOriginal}{page_number}/"
            print(url)

            # Close the connection even when reading the response fails.
            uClient = uReq(url)
            try:
                page_html = uClient.read()
            finally:
                uClient.close()

            page_soup = soup(page_html, "html.parser")

            articles = page_soup.findAll(
                'li', class_='sfnewsListItem sflistitem')
            related_items = page_soup.findAll(
                'li', class_='sfrelatedListItem sflistitem')
            readmore = page_soup.findAll('div', class_='NewsEvent_right')

            for container in articles:
                title = container.a.text
                div_tags = container.findAll('div')
                date_text = div_tags[0].text.strip()

                # The indexing below presumably expects text shaped like
                # "<weekday>, <month> <day>, <year>" -- TODO confirm against
                # the live markup.
                comma_parts = date_text.split(',')
                year = comma_parts[2]
                month_and_day = comma_parts[1].split(' ')
                day = month_and_day[2]
                month = month_and_day[1]
                # Pieces are joined without any separator, exactly as split.
                fulldate = day + month + year
                summary = div_tags[1].text.strip()

                # NOTE(review): every article on a page receives the image of
                # the first related item and the link of the first
                # 'NewsEvent_right' block -- confirm this is intended.
                img = related_items[0].a.img['src']
                read_more = link_prefix + readmore[0].a['href']

                month_number: int = month_string_to_number(month)
                event_day = datetime(int(year), month_number, int(day))
                if event_day <= datetime.now():
                    # Past (or current-instant) events are not collected.
                    continue

                if ordinates is None:
                    try:
                        ordinates = getOrdinates(location)
                    except Exception as err:
                        # Skip this event but keep scraping; the lookup is
                        # retried for the next upcoming event.
                        print('Geocoding failed for ' + location + ': '
                              + str(err))
                        continue

                data = EventData()

                data.id = str(uuid.uuid1())
                data.title = title
                data.startdate = fulldate
                data.enddate = ''
                data.time = ''
                data.category = category
                data.price = ''
                data.summary = summary
                data.address = ordinates[2]
                data.location = location
                data.img = img
                data.latitude = ordinates[0]
                data.longitude = ordinates[1]
                data.read_more = read_more
                data_list.append(data)

        print(len(data_list))
        return data_list