Example #1
    def miner(self):
        # Miner's track: we land on the artwork listings page and pick the listings from there.
        # We pick the sellers and artists from the artwork pages.
        # From the artwork pages we fetch the artworks for sale by the listed artists.

        self.get_artist_listings()
        # print(kazoart.artist_listings)
        # print(self.artist_listings)
        # print("ARTIST LISTINGS")
        # print(len(self.artist_listings))
        # time.sleep(10)

        # Discard these links from visited so that their pages can be fetched again.
        for link in self.first_prod_list:
            visited.discard(link)

        self.get_artwork_listings_master()
        # get_artwork_listings_master -> get_artwork_listings_slave -> get_artist_data -> write_artist_data
        # So we're done with artist data.
        # print(f"ARTWORK LISTINGS, {len(self.artwork_listings)}")
        # print(len(self.artwork_listings))
        # time.sleep(10)

        self.get_artwork_data_master()

        # DATA COLLECTION COMPLETED FOR THIS MODULE.
        # DOWNLOADING IMAGES NOW.
        TheMiner.sir_image_manager()
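
Note: every example in this collection leans on a module-level visited set and a TheMiner.fetch_page helper that are not included here. A minimal sketch of how the two presumably interact (the requests/BeautifulSoup details are assumptions), just to show why miner() discards links from visited before re-crawling them:

# Hypothetical sketch -- the real TheMiner.fetch_page is not part of this collection.
import requests
from bs4 import BeautifulSoup

visited = set()

def fetch_page(url, ghost=False):
    # Callers use visited.discard(url) when they want a page to be fetchable again.
    if url in visited:
        return None
    visited.add(url)
    # ghost is accepted for interface parity; the stealthier fetching mode is not sketched here.
    try:
        response = requests.get(url, timeout=30)
    except requests.RequestException:
        return None
    if response.status_code != 200:
        return None
    return BeautifulSoup(response.text, 'html.parser')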
Example #2
    def miner(self):
        self.get_artist_listings()
        print(len(self.artist_listings))

        self.get_artwork_listings_master()
        # get_artwork_listings_master -> get_artwork_listings_slave -> get_artist_data -> write_artist_data
        # So we're done with artist data.
        print(len(self.artwork_listings))

        self.get_artwork_data_master()

        # DATA COLLECTION COMPLETED FOR THIS MODULE.
        # DOWNLOADING IMAGES NOW.
        TheMiner.sir_image_manager()
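
The call chain noted in the comments (get_artwork_listings_master -> get_artwork_listings_slave -> get_artist_data -> write_artist_data) never shows the master itself. A plausible minimal sketch, assuming the same concurrent.futures fan-out pattern used in Examples #7 and #18; the method and attribute names mirror the slaves shown later:

    # Hypothetical sketch -- the real master is not included in this collection.
    # Assumes "import concurrent.futures" at module level, as in Examples #7 and #18.
    def get_artwork_listings_master(self):
        # Fan the slave out over every collected artist listing.
        with concurrent.futures.ThreadPoolExecutor() as executor:
            results = executor.map(self.get_artwork_listings_slave, self.artist_listings)
        # Exhaust the iterator so exceptions raised inside the worker threads surface here.
        for _ in results:
            pass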
Example #3
    def get_seller_data(self, url):
        # Caller :: get_artwork_data_slave and get_seller_id
        visited.discard(url)
        soup = TheMiner.fetch_page(url)

        if soup is not None:

            # Seller's name (selector as in the fuller variant of this method, Example #6).
            # The code will break if the seller's name is not found.
            top_seller = soup.find('div', id='top-seller')
            seller_name = top_seller.h1.text.strip()
            print(seller_name)

            # Location
            try:
                location = top_seller.find('p', class_="subtitle").text.strip().split(',')
                location = location[-1].strip()
                print(location)
            except AttributeError:
                location = None

            # Website
            try:
                website = str(soup.find('ul', id="websites").a['href']).strip()
                print(website)
            except AttributeError:
                website = None
            except TypeError:
                website = None

            bundle = [url, self.website.platform, seller_name, location, website]
            print(bundle)
            TheAuthour.write_seller(*bundle)
Example #4
        def recurrent(i_url, depth):
            soup = TheMiner.fetch_page(i_url)
            if soup is not None:
                product_list = soup.find('div', class_='product-list-wrapper')
                product_list = product_list.find_all('div', class_='grid-item')
                for product in product_list:
                    item_price = str(
                        product.find(
                            'div',
                            class_='grid-item-price').text).strip().upper()
                    # Discard the data that does not have a price.
                    if not item_price == "SOLD":
                        product_link = str(product.a['href']).strip()
                        # Discarding urls that do not take us to paintings and sculptures. (RULE : 1)
                        if "/sculpture/" in product_link or "/painting/" in product_link:
                            if product_link not in self.artwork_listings:
                                self.artwork_listings.append(product_link)

                # Get artist data if depth is "1", if depth is more than "1" ignore this block.
                # To pick the artist's data, from the first page of the listings.
                if depth == 1:
                    # Calling the function to fetch the artist data, and return artist_id
                    self.get_artist_data(soup, i_url)

                next_page = soup.find('div', class_='page-browser')
                if next_page is not None:
                    next_page = next_page.find('div',
                                               class_='page-browser-numbers')
                    next_page = next_page.find_all('a',
                                                   class_='page-browser-item ')
                    for next_ in next_page:
                        next_ = self.website.domain + str(next_['href'])
                        recurrent(next_, depth + 1)
Example #5
    def key_maker(artist_url):
        visited.discard(artist_url)
        soup = TheMiner.fetch_page(artist_url)
        if soup is not None:
            artist_resume = soup.find('div', class_='artist-resume').find('div', class_='artist-resume_text')
            name = artist_resume.h1.text.strip()
            print(name)
            # If an error occurs here, it's because the page layout has changed and the code needs to be fixed.

            if name is not None:
                try:
                    country = artist_resume.find('p', class_='location').text.strip().split('\n')
                    country = country[0].split(',')
                    country = country[-1].strip()
                    print(country)
                except AttributeError:
                    country = None

                about = soup.find('div', id='about').text.strip()
                # About will either be found and be some text or be None.
                # print(about)

                artist_data_pack = [name, None, country, about]
                key = db.Artist.key_maker(artist_data_pack)
                # pack = [name, born, country, about]
                return key

        else:
            return None
Example #6
    def get_seller_data(self, url):
        # Caller :: get_artwork_data_slave
        visited.discard(url)
        soup = TheMiner.fetch_page(url)

        # print("A")
        if soup is not None:
            # print("B")

            A = soup.find('div', id='top-seller')
            seller_name = A.h1.text.strip()
            # print(seller_name)
            # Code will break if seller's name is not found

            try:
                location = A.find('p',
                                  class_="subtitle").text.strip().split(',')
                location = location[-1].strip()
                # print(location)
            except AttributeError:
                location = None
            try:
                website = str(soup.find('ul', id="websites").a['href']).strip()
                # print(website)
            except AttributeError:
                website = None
            except TypeError:
                website = None

            bundle = [
                url, self.website.platform, seller_name, location, website
            ]
            # print(bundle)
            TheAuthour.write_seller(*bundle)
Example #7
        def recurr(url):
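            # ghost=True presumably switches fetch_page to a slower / harder-to-block fetching mode;
            # its implementation is not shown anywhere in this collection.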
            soup = TheMiner.fetch_page(url, ghost=True)
            if soup is not None:
                # Because Singulart keeps blocking IPs, we wrap everything in try/except blocks.
                try:
                    # artist_blocks = soup.find_all('div', class_='artist-container')
                    artist_blocks = soup.find_all('figure', class_='pic-artist')
                    print(len(artist_blocks))
                    for artist in artist_blocks:
                        link = artist.figcaption.h2.a.get('href')
                        if self.website.domain not in link:
                            link = self.link_maker(link)
                        self.artist_listings.append(link)
                    # print(self.artist_listings)

                    # next pages
                    next_pages = soup.find('div', class_='pagerfanta').find('nav')
                    next_pages = next_pages.find_all('a')
                    for next_ in next_pages:
                        link = next_.get('href')
                        if self.website.domain not in link:
                            link = self.link_maker(link)
                        if link not in self.listy:
                            self.listy.append(link)

                    # print(listy)
                    # print(len(listy))

                    with concurrent.futures.ThreadPoolExecutor() as executor:
                        trig = executor.map(recurr, self.listy)
                    for trigger in trig:
                        pass
                except AttributeError:
                    visited.discard(url)
                    pass
Example #8
        def gal(url):
            soup_ = TheMiner.fetch_page(url)
            if soup_ is not None:
                try:
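                    # The class names look auto-generated (CSS-in-JS hashes), so they are matched
                    # with a compiled regex prefix rather than an exact string.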
                    artist_url = soup_.find(
                        'div',
                        class_=re.compile(
                            r'Box-sc-15se88d-0 GridColumns__Cell-sc-1g9p6xx-1 cviiXL\.*'
                        ))
                    artist_url = artist_url.find(
                        'a',
                        class_=re.compile(
                            r'Box-sc-15se88d-0 Flex-cw39ct-0\.*'))['href']

                    if self.website.domain in artist_url:
                        pass
                    else:
                        artist_url = self.link_maker(artist_url)
                    if artist_url not in self.artist_listings:
                        self.artist_listings.append(artist_url)

                except AttributeError:
                    pass
                except TypeError:
                    pass
Example #9
    def get_artist_listings(self):
        soup = TheMiner.fetch_page(self.website.start_url)

        if soup is not None:
            # Selectors as in the complete variant of this method (Example #12).
            artist_thumbnails = soup.find('div', class_='artists-thumbnails')
            artist_thumbnails = artist_thumbnails.find_all('div', class_='artists-thumbnails__item')
            for artist in artist_thumbnails:
                self.artist_listings.append(str(artist.a['href']).strip())
Example #10
        def recurrent(i_url, depth):
            soup = TheMiner.fetch_page(i_url)
            if soup is not None:
                # Gather a list of all the products (selectors as in the complete variant in Example #4).
                product_list = soup.find('div', class_='product-list-wrapper')
                product_list = product_list.find_all('div', class_='grid-item')
                for product in product_list:
                    # Check for price, discard the links that do not have a price.
                    item_price = str(product.find('div', class_='grid-item-price').text).strip().upper()
                    # Discard the data that does not have a price. (RULE : 6)
                    if not item_price == "SOLD":
                        product_link = str(product.a['href']).strip()
                        # Discarding urls that do not take us to paintings and sculptures. (RULE : 1)
                        if "/sculpture/" in product_link or "/painting/" in product_link:
                            if product_link not in self.artwork_listings:
                                self.artwork_listings.append(product_link)

                # Get artist data if depth is "1", if depth is more than "1" ignore this block.
                # To pick the artist's data, from the first page of the listings.
                if depth == 1:
                    # Calling the function to fetch the artist data, and return artist_id
                    self.get_artist_data(soup, i_url)

                # Find the links to all other pages (selectors as in the complete variant in Example #4).
                pager = soup.find('div', class_='page-browser')
                next_pages = pager.find('div', class_='page-browser-numbers').find_all('a') if pager is not None else []
                for next_page in next_pages:
                    next_page = self.website.domain + str(next_page['href'])
                    recurrent(next_page, depth + 1)
Example #11
    def seller_info(self, soup):

        seller_bundle = []
        # Seller name
        # Seller's website
        # Seller's location
        # Return seller_id, seller_bundle
        # Write data to the "sellers" table in the db.
        try:
            A = soup.find('div', id='top-seller')
            B = A.find('a')
            seller_name = str(B.text).strip()
            location = str(A.find('p', class_='subtitle').text).strip()
            # If the seller and location are already recorded in the global SELLER_INFO, we fetch the seller_id
            # and return it.
            seller_name = "_".join([seller_name, location])
            if seller_name in SELLER_INFO.keys():
                seller_id = SELLER_INFO[seller_name]
                print(f"We have a seller for seller id {seller_id}, named {seller_name}")
                return seller_id, None
            link = B['href']
            if 'galeries-d-art' in str(link):
                link = re.sub('galeries-d-art', 'art-galleries', link)

        except AttributeError:
            link = None
        except TypeError:
            link = None
        if link is not None:
            # Moving to seller page now.!!
            # Read the name and location before moving to the next page.
            soup = TheMiner.fetch_page(link)
            # visited.discard(link)
            if soup is not None:
                try:
                    A = soup.find('div', id = 'top-seller')
                    seller_name = A.h1.text.strip()
                    # print(seller_name)
                except AttributeError:
                    return 1, seller_bundle
                try:
                    location = A.find('p', class_="subtitle").text.strip()
                    # print(location)
                except AttributeError:
                    location = None
                try:
                    website = str(soup.find('ul', id="websites").a['href']).strip()
                    # print(website)
                except AttributeError:
                    website = None
                except TypeError:
                    website = None

                seller_bundle.append(seller_name)
                seller_bundle.append(location)
                seller_bundle.append(website)
                return 0, seller_bundle

        return 1, seller_bundle
Example #12
    def get_artist_listings(self):
        soup = TheMiner.fetch_page(self.website.start_url)

        if soup is not None:
            artist_thumbnails = soup.find('div', class_='artists-thumbnails')
            artist_thumbnails = artist_thumbnails.find_all('div', class_='artists-thumbnails__item')
            for artist in artist_thumbnails:
                self.artist_listings.append(str(artist.a['href']).strip())
Example #13
    def miner(self):

        # self.get_artist_listings()
        # print(kazoart.artist_listings)
        # print(len(self.artist_listings))

        self.get_artwork_listings_master()
        # get_artwork_listings_master -> get_artwork_listings_slave
        # We're still not done with artist data
        print(len(self.artwork_listings))

        self.get_artwork_data_master()

        # DATA COLLECTION COMPLETED FOR THIS MODULE.
        # DOWNLOADING IMAGES NOW.
        print("downloading images now.")
        TheMiner.sir_image_manager(chunk_size=100)
Example #14
def main():
    # art_page_url = 'https://www.artsper.com/us/contemporary-artworks/painting/1147236/les-deux-freres'
    art_page_url = "https://www.artsper.com/in/contemporary-artworks/painting/189196/candy-zinzin-de-lespace"
    artsperpainters = Website('https://www.artsper.com',
                              'https://www.artsper.com/us/contemporary-artists/youngtalents/painters?',
                              "ARTSPER")

    a = Artsper(artsperpainters)
    # print(a.get_art_data_core(art_page_url))
    a.seller_info(TheMiner.fetch_page(art_page_url))
Example #15
def main():
    start = time.perf_counter()

    # Creating SELLER_INFO
    sellers = db.Sellers()
    sellers.read_data_sellers()

    # Creating ARTIST_INFO
    artists = db.Artist()
    artists.read_artist_data()

    artsperpainters = Website(
        'https://www.artsper.com',
        'https://www.artsper.com/us/contemporary-artists/youngtalents/painters?',
        "ARTSPER")

    a_m = Artsper(artsperpainters)
    a_m.artsper_mine()

    finish = time.perf_counter()
    print(
        f"Lap Completed in {round(finish - start, 2)}, seconds.\n Starting sculptures"
    )

    artspersculptors = Website(
        'https://www.artsper.com',
        'https://www.artsper.com/us/contemporary-artists/youngtalents/sculptors-artists',
        "ARTSPER")

    a_m = Artsper(artspersculptors)
    a_m.artsper_mine()

    finish = time.perf_counter()

    print(
        f"Lap Completed in {round(finish - start, 2)}, seconds.\n Downloading and updating images"
    )

    TheMiner.sir_image_manager()

    finish = time.perf_counter()
    print(f"Finished in {round(finish - start, 2)}, seconds")
Example #16
    def get_listing_pages(self, url):
        print("Fetching all listing pages")
        soup = TheMiner.fetch_page(url)
        # Pop the url out of visited so that it can be used again while fetching artists.
        visited.remove(url)
        self.listing_pages.append(url)
        listings = soup.find('div', class_="paginator")
        listings = listings.find_all('a')
        for lis in listings:
            u = self.website.url_maker(lis['href'])
            # Dealing with sites that redirect the scraper to the French pages of the artworks.
            if "oeuvres-d-art-contemporain" in u:
                u = re.sub("oeuvres-d-art-contemporain", "contemporary-artworks", u)
            if u not in self.listing_pages:
                self.listing_pages.append(u)
Example #17
    def get_artwork_listings_slave(self, url):
        # Runs on artist_listings.

        soup = TheMiner.fetch_page(url)
        if soup is not None:

            # Gather a list of all the products.
            main_ = soup.find('main', id='main')
            main_ = main_.find('div',
                               class_=re.compile(r'Box-sc-15se88d-0\.*'))
            # print(main.prettify())
            main_ = main_.find(
                'div',
                class_=re.compile(
                    r'Box-sc-15se88d-0 Shelf__Container-sc-1kdkue-0\.*'))
            # print(main.prettify())
            try:
                main1 = main_.find(
                    'div',
                    class_=re.compile(
                        r'Box-sc-15se88d-0 FullBleed-g9qwfe-0\.*'))
                product_list = main1.find_all('li')
            except AttributeError:
                try:
                    main1 = main_.find(
                        'div',
                        class_=re.compile(
                            r'Box-sc-15se88d-0 FullBleed-g9qwfe-0\.*'))
                    product_list = main1.find_all('li')
                    # print("REGEX")
                except AttributeError:
                    product_list = None

            if product_list is not None:
                # print(product_list)
                for product in product_list:
                    # All the products here are "Available for Sale."
                    if self.website.domain not in product.a['href']:
                        product_link = self.link_maker(product.a['href'])
                    else:
                        product_link = product.a['href']

                    if product_link not in self.artwork_listings:
                        # print(product_link)
                        self.artwork_listings.append(product_link)

                # Sending the soup to fetch artist's data, make artist listings.
                self.get_artist_data(soup, url)
Example #18
        def recurr(url):
            soup = TheMiner.fetch_page(url)
            if soup is not None:
                try:
                    container = soup.find(
                        'div',
                        class_=re.compile(
                            r'LoadingArea__Container-sc-1cnoyb0-2\.*'))
                    artist_thumbnails = container.find_all(
                        'div',
                        class_=re.compile(
                            r'GridItem__ArtworkGridItem-l61twt-3\.*'))
                    # print(container.prettify())
                    for artist in artist_thumbnails:
                        arti = artist.div.a['href']
                        if self.website.domain in arti:
                            artist = arti
                        else:
                            artist = self.link_maker(arti)
                        if artist not in self.first_prod_list:
                            self.first_prod_list.append(artist)
                except AttributeError:
                    print("Something went wrong for url {}")

                try:
                    next_pages = soup.find(
                        'nav',
                        class_=re.compile(
                            r'Box-sc-15se88d-0 Text-sc-18gcpao-0 ibHUpM\.*'))
                    next_pages = next_pages.find_all(
                        'a', class_=re.compile(r'Link-oxrwcw-0\.*'))
                    for a in next_pages:
                        link = self.link_maker(a['href'])
                        if link not in listy:
                            listy.append(link)
                except AttributeError:
                    pass

                with concurrent.futures.ThreadPoolExecutor() as executor:
                    beta = executor.map(recurr, listy)
                for alpha in beta:
                    pass
Example #19
        def recurrent(i_url, depth):
            soup = TheMiner.fetch_page(i_url)
            if soup is not None:

                artwork = soup.find('div', class_="catalog")
                artwork = artwork.find_all('figure')
                for art in artwork:
                    # If listing is sold, don't pick it up.
                    try:
                        sold = art.find('p', class_='price soldout sold').text
                        sold = True
                    except AttributeError:
                        sold = False

                    link = art.a['href']
                    if 'oeuvres-d-art-contemporain' in link:
                        link = re.sub('oeuvres-d-art-contemporain',
                                      'contemporary-artworks', link)
                    if link not in self.artwork_listings and not sold:
                        la = str(link).split('/')
                        if 'painting' in la or 'sculpture' in la:
                            self.artwork_listings.append(link)

                if depth == 1:
                    # Calling the function to fetch the artist data, and return artist_id
                    self.get_artist_data(soup, i_url)
                    # This block picks the urls of pages for artists who have listings on more than one pages.
                    # And launches the code to pick the artwork_listings and artist data
                    try:
                        next_ = []
                        listings = soup.find('div', class_="paginator")
                        listings = listings.find_all('a')
                        for li in listings:
                            ur = self.website.url_maker(li['href'])
                            next_.append(ur)
                            # print(ur)
                        for ur in next_:
                            recurrent(ur, depth + 1)
                    except AttributeError:
                        # For Artists who do not have a second listings page. They'll throw an AttributeError
                        pass
Example #20
        def recur(i_url, depth):
            soup = TheMiner.fetch_page(i_url)
            if soup is not None:
                figures = soup.find_all('figure')
                for figure in figures:
                    self.artist_listings.append(str(figure.a['href']).strip())

                if depth == 1:
                    next_ = []
                    listings = soup.find('div', class_="paginator")
                    listings = listings.find_all('a')
                    for lis in listings:
                        u = self.website.url_maker(lis['href'])
                        # Dealing with sites that redirect the scraper to the French pages of the artworks.
                        if "oeuvres-d-art-contemporain" in u:
                            u = re.sub("oeuvres-d-art-contemporain",
                                       "contemporary-artworks", u)
                        if u not in next_:
                            next_.append(u)
                    for link in next_:
                        recur(link, depth + 1)
Example #21
    def artist_id_slave(self, artist_url):
        visited.discard(artist_url)
        soup = TheMiner.fetch_page(artist_url)
        if soup is not None:
            self.get_artist_data(soup, artist_url)
            # Getting the key from KEY_INFO
            if artist_url in KEY_INFO.keys():
                key = KEY_INFO.get(artist_url)
                # Getting artist_id using the key from ARTIST_INFO
                if key in ARTIST_INFO.keys():
                    artist_id = ARTIST_INFO.get(key)
                    return artist_id
                else:
                    print("ARTIST_ID_SLAVE : Artist id not in ARTIST_INFO")
                    return None
            else:
                print("ARTIST_ID_SLAVE : Could not find artist_id")
                return None

        else:
            print("ARTIST_ID_SLAVE : Soup not returned")
            return None
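
Example #21 resolves an artist_id through two module-level caches that the rest of this collection only uses implicitly. As far as these excerpts show, the layout is roughly the following (the url, key, and id below are invented for illustration):

# Illustration only -- the url, key format, and id are made up.
KEY_INFO = {}      # artist page url -> key returned by db.Artist.key_maker (see Example #5)
ARTIST_INFO = {}   # key -> artist_id assigned when the artist row is written to the db

KEY_INFO["https://example.com/artists/jane-doe"] = "some-artist-key"
ARTIST_INFO["some-artist-key"] = 42

# artist_id_slave above therefore resolves url -> key -> artist_id with two dictionary lookups.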
Example #22
    def get_artwork_listings_slave(self, url):

        soup = TheMiner.fetch_page(url, ghost=True)
        # Artist's info and artwork listings are available on the same page.
        if soup is not None:
            try:
                name = soup.find('div', class_='artist-intro').find('div', class_='content').h1.text
                # The name lookup will crash if the page is not returned.
                block = soup.find_all('div', class_='artist-container artist-container--details')
                print(f"BLOCK : {len(block)}")
                try:
                    for chunk in block:
                        items = chunk.find_all('figure', class_='artwork-item artwork-item--details')
                        print(f"ITEMS : {len(items)}")

                        for piece in items:
                            paise = piece.find('div', class_='meta').text.strip()
                            # print(paise)
                            if "Sold" not in str(paise):
                                # print("B")
                                a = piece.find('a')['href']
                                if self.website.domain not in a:
                                    a = self.link_maker(a)
                                if a not in self.artwork_listings:
                                    self.artwork_listings.append(a)

                except AttributeError:
                    # print("A")
                    pass

                self.get_artist_data(soup, url)

            except AttributeError:
                print("B")
                # Urls that get blocked are discarded from visited and added to listy for a retry
                # (processed linearly if listy is small, multithreaded once it grows, until it is brought down to size).
                visited.discard(url)
                self.listy.append(url)
Example #23
    def get_artwork_data_slave(self, url):
        soup = TheMiner.fetch_page(url, ghost=True)
        if soup is not None:
            # Initiation

            try:
                # Artist_url
                artist_url = soup.find('div', class_='artwork-focus').find_all('div', class_='col-md-12 col-lg-6')
                try:
                    artist_url = artist_url[1].find('h2').a['href']
                    if self.website.domain not in artist_url:
                        artist_url = self.link_maker(artist_url)
                except AttributeError:
                    artist_url = None

                # Artist_id
                artist_id = self.artist_id

            except AttributeError:
                # Comes here if the page is not returned by the website.
                visited.discard(url)
                self.listy.append(url)
Example #24
        def recurrent(i_url, depth):
            soup = TheMiner.fetch_page(i_url)
            if soup is not None:
                product_list = soup.find('div', class_='product-list-wrapper')
                product_list = product_list.find_all('div', class_='grid-item')
                for product in product_list:
                    item_price = str(product.find('div', class_='grid-item-price').text).strip().upper()
                    # Discard the data that does not have a price.
                    if not item_price == "SOLD":
                        product_link = str(product.a['href']).strip()
                        self.artwork_listings.append(product_link)

                # Get artist data if depth is "1", if depth is more than "1" ignore this block.
                if depth == 1:
                    # Calling the function to fetch the artist data, and return artist_id
                    self.get_artist_data(soup, i_url)

                next_page = soup.find('div', class_='page-browser')
                if next_page is not None:
                    next_page = next_page.find('div', class_='page-browser-next').a
                    if next_page is not None:
                        next_page = self.website.domain + str(next_page['href'])
                        depth += 1
                        recurrent(next_page, depth)
Example #25
    def get_artwork_data_slave(self, url):
        # print("ARTWORK DATA SLAVE STARTS")
        soup = TheMiner.fetch_page(url)
        # print("A")
        if soup is not None:
            # print("ARTWORK DATA SLAVE GETS SOUP")
            # Field initiation ::

            artwork = None
            price = None
            type_ = None
            dimensions = None
            frame = None
            authenticity = None
            about = None
            artist_id = None
            image_loc = None
            year = None
            support = None
            signature = None
            # Material to be added to technique
            technique = None

            seller_id = None
            artist = None
            medium = None

            # Medium must always have "Painting" or "Sculpture" (RULE :: 2)
            # print("A.1")

            # Seller_url
            seller_url = None
            seller_box = soup.find_all(
                'div',
                re.compile(
                    r'Box-sc-15se88d-0 Flex-cw39ct-0 BorderBoxBase-sc-1072ama-0\.*'
                ))
            for se in seller_box:
                if se.get('data-test') == 'aboutTheWorkPartner':
                    try:
                        seller_url = se.find('a')['href']
                        if self.website.domain not in seller_url:
                            seller_url = self.link_maker(seller_url)
                    except TypeError:
                        seller_url = se.next.next.next.next.text
            # print(seller_url)

            # seller_id
            if seller_url is not None:
                seller_id = self.get_seller_id(seller_url)

            # artist url
            artist_url = None
            artist_box = soup.find_all(
                'div',
                re.compile(
                    r'Box-sc-15se88d-0 Flex-cw39ct-0 BorderBoxBase-sc-1072ama-0\.*'
                ))
            for ar in artist_box:
                if ar.get('data-test') == 'artistInfo':
                    try:
                        artist_url = ar.find('a')['href']
                        if self.website.domain not in artist_url:
                            artist_url = self.link_maker(artist_url)
                    except TypeError:
                        pass
            # print(artist_url)

            artist_id = self.get_artist_id(artist_url)
            # print(f"Seller id {seller_id} \nArtist id {artist_id}")
            # except AttributeError:
            #     pass

            # Medium
            try:
                medium = soup.find(
                    'dl', class_='Box-sc-15se88d-0 Flex-cw39ct-0 bKPevV'
                ).dd.text.strip()
                if "SCULPTURE" in str(medium).upper():
                    medium = "Sculpture"
                elif "PAINTING" in str(medium).upper():
                    medium = "Painting"
                else:
                    medium = None
            except AttributeError:
                pass
            # print(f"Medium {medium}")

            # Continue fetching data only if seller_id, artist_id and medium are found. (RULE :: 3, 4)
            if seller_id is not None and artist_id is not None and medium is not None:

                try:
                    price = soup.find_all(
                        'div',
                        class_=re.compile(
                            r'Box-sc-15se88d-0 Text-sc-18gcpao-0\.*'))
                    for p in price:
                        if p.get('data-test') == 'SaleMessage':
                            price = p.text
                            break

                    temp = ""
                    for i in price:
                        if i == "-":
                            break
                        if i.isdigit():
                            temp += i
                        if i == ".":
                            temp += i

                    price = float(temp) * rate
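                    # 'rate' is presumably a module-level currency conversion factor; it is not defined in this excerpt.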
                    # Price
                    # print(price)
                except AttributeError:
                    price = None
                except ValueError:
                    price = None
                except TypeError:
                    price = None

                # RULE : 5
                if price is not None:

                    # Find artist, artwork, year, type_, dimensions, support, frame, signature, authenticity,
                    # about, image_loc(actual url of the image), and technique

                    # We want the code to break if either the artist's name or the artwork's name is not found.
                    # Artist
                    artist_name = soup.find_all(
                        'div', class_=re.compile(r'Box-sc-15se88d-0'))
                    for a in artist_name:
                        if a.get('data-test') == 'artworkSidebar':
                            artist_ = a.find_all(
                                'div',
                                class_=re.compile(
                                    r'Box-sc-15se88d-0 Text-sc-18gcpao-0\.*'))
                            for a in artist_:
                                if len(a.text.strip()) != 0:
                                    artist = a.text
                                    # print(artist)
                                    break
                            break
                    # print(artist)

                    # Artwork
                    artwork_block = soup.find('h1').text.split(",")
                    artwork = artwork_block[0].strip()
                    try:
                        year = artwork_block[-1].strip()
                        t = ""
                        for y in year:
                            if str(y) == "-":
                                break
                            if str(y).isnumeric():
                                t += y
                        year = int(t)
                    except ValueError:
                        year = None

                    # type(unique or what)
                    try:
                        type_ = soup.find(
                            'h1'
                        ).nextSibling.nextSibling.nextSibling.text.strip()
                    except AttributeError:
                        pass

                    # Dimensions
                    try:
                        dimensions = soup.find(
                            'h1').nextSibling.nextSibling.find_all('div')
                        for dim in dimensions:
                            if 'cm' in dim.text:
                                dimensions = dim.text.strip()
                    except AttributeError:
                        pass

                    # Technique
                    try:
                        technique = soup.find('h1').nextSibling.text.strip()
                        # print(technique)

                    except AttributeError:
                        pass

                    # Support, frame, sign, auth, about
                    # frame, auth , sign

                    try:
                        bundle = soup.find_all(
                            'div',
                            class_=re.compile(
                                r'Box-sc-15se88d-0 Flex-cw39ct-0 BorderBoxBase-sc-1072ama-0 BorderBox-sc-18mwadn-0 StackableBorderBox-sc-1odyc7i-0\.*'
                            ))
                        for b in bundle:
                            if b.get('data-test') == 'aboutTheWorkPartner':
                                bud = b.nextSibling
                                # print(bud.prettify())
                                break
                        bundle = bud.find_all('dl')
                        for dl in bundle:

                            if dl.next.text.strip() == 'Signature':
                                signature = dl.dd.text.strip()
                                continue

                            if dl.dt.text.strip(
                            ) == 'Certificate of authenticity':
                                authenticity = dl.dd.text.strip()
                                continue

                            if dl.dt.text.strip() == 'Frame':
                                frame = dl.dd.text.strip()
                                continue
                    except AttributeError:
                        pass

                    try:
                        about = soup.find(
                            'div',
                            class_='Box-sc-15se88d-0 Text-sc-18gcpao-0  gPzDV'
                        ).find('div',
                               class_='ReadMore__Container-sc-1bqy0ya-0 guOJdN'
                               ).p.text.strip().split("  ")

                        t = ""
                        for a in about:
                            t += a.strip()
                            t += " "
                        about = t
                    except AttributeError:
                        about = None

                    # Image location
                    try:
                        image_loc = soup.find_all('div',
                                                  class_='Box-sc-15se88d-0')
                        for loc in image_loc:
                            if loc.get('data-test') == 'artworkImage':
                                image_loc = loc.find('img').get('src')
                                break
                    except AttributeError:
                        pass

                    artwork_bundle = {
                        "artwork_title": artwork,
                        "artist_name": artist,
                        "year": year,
                        "price": price,
                        "Medium": medium,
                        "Type": type_,
                        "Dimensions": dimensions,
                        "Support": support,
                        "Frame": frame,
                        "Signature": signature,
                        "Authenticity": authenticity,
                        "About": about,
                        "platform": self.website.platform,
                        "image_addr": image_loc,
                        "seller_id": seller_id,
                        "artist_id": artist_id,
                        "url": url,
                        "technique": technique
                    }
                    # print(artwork_bundle)

                    TheAuthour.write_artwork_price_image(**artwork_bundle)
                else:
                    pass
                    # print(f"Skipping {url}\n PRICE : {price}")
            else:
                pass
                # print(f"Skipping : {url}\nSeller_id = {seller_id}, Artist_id = {artist_id}, medium = {medium}")
        else:
            pass
Example #26
    def get_artwork_data_slave(self, url):
        soup = TheMiner.fetch_page(url)
        if soup is not None:

            # Field initiation :: Artwork_title, artist, price, seller_id :: (picked),
            # medium, type, dimension, frame, authenticity, about  :: year, support, signature
            # artist_id, Image_loc = None

            seller_id = None
            artist = None
            artwork = None
            price = None

            # Material to be added to medium
            material = None

            medium = None  # (painting or sculpture)
            type_ = None
            dimensions = None
            frame = None
            authenticity = None
            about = None

            artist_id = None
            image_loc = None
            year = None
            support = None
            signature = None

            seller_url = str(soup.find('div', class_='product-artist').a.get('href')).strip()
            # We want the code to break if this entry is not found so that we can fix it.
            # THE PAGE MUST HAVE A SELLER.

            # Seller_id
            if seller_url is not None:
                if seller_url in SELLER_INFO:
                    seller_id = SELLER_INFO.get(seller_url)
                    print(seller_id)
                else:
                    self.get_seller_data(seller_url)
                    if seller_url in SELLER_INFO:
                        seller_id = SELLER_INFO.get(seller_url)
                    else:
                        if seller_id is None:
                            print("SIRE THE MATRIX HAS GLITCHED. ENTRY NOT IN SELLER_INFO. WE SHALL BREAK.")
            else:
                if seller_id is None:
                    print("SIRE THE MATRIX HAS GLITCHED. ENTRY NOT IN SELLER_INFO. WE SHALL BREAK.")

            # Artist_id
            if seller_url is not None:
                if seller_url in ARTIST_INFO:
                    artist_id = ARTIST_INFO.get(seller_url)
                    print(artist_id)
                else:
                    if artist_id is None:
                        print("SIRE THE MATRIX HAS GLITCHED. ENTRY NOT IN ARTIST_INFO. WE SHALL BREAK.")
            else:
                # If it ever comes to here, the page will not have a Seller/Artist
                if artist_id is None:
                    print("SIRE THE MATRIX HAS GLITCHED. ENTRY NOT IN ARTIST_INFO. WE SHALL BREAK.")

            A = soup.h1
            B = A.find('div', class_='product-artist')
            artist = str(B.a.text).strip()
            # Artist
            print(artist)

            artwork = str(A.find('div', class_='product-name').text).strip()
            # Artwork
            print(artwork)

            price = str(soup.find('div', class_='product-price').find('div', class_='p-price-container').text).strip()
            temp = ""
            for i in price:
                if i.isdigit():
                    temp += i
            price = int(temp)
            # Price
            print(price)

            product_details_desc = soup.find('div', class_='product-details_desc')
            product_details = product_details_desc.find_all('div', class_='tech-item')

            for detail in product_details:
                label = str(detail.find('div', class_='tech-label').text).strip().upper()
                value = str(detail.find('div', class_='tech-value').text).strip()
                print(label)
                print(value)

                if label == 'TECHNIQUE':
                    medium = value
                elif label == 'TYPE':
                    type_ = value
                elif label == 'MATERIAL':
                    # We don't need material. Adding material to medium??
                    material = value
                elif label == 'DIMENSIONS':
                    dimensions = value
                elif label == 'FRAMING':
                    frame = value
                elif label == 'QUALITY GUARANTEE':
                    authenticity = value

                # if that is not here, it'll throw errors.
                # elif label == ''

            try:
                about = str(product_details_desc.find('div', class_='desc text-1').text).strip()
            except AttributeError:
                about = None

            # If material is None, we don't add it to medium.
            if material is not None:
                # If medium is None, we make it a string before adding material to it.
                if medium is None:
                    medium = ""
                else:
                    medium += " "
                medium += material

            # self, artwork_title=None, artist_name=None, year=None, price=None, Dimensions=None, Medium=None,
            #     Type=None, Support=None, Frame=None, Signature=None, Authenticity=None, About=None,
            #      platform=None, image_addr=None, seller_id=None, artist_id=None)

            artwork_bundle = {"artwork_title": artwork, "artist_name": artist, "year": year, "price": price,
                              "Medium": medium, "Type": type_, "Dimensions": dimensions, "Support": support,
                              "Frame": frame, "Signature": signature, "Authenticity": authenticity,
                              "About": about, "image_addr": image_loc, "seller_id": seller_id,
                              "artist_id": artist_id}

            self.write_artwork_data(**artwork_bundle)
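
Several later examples (#25, #27, #29) call a get_seller_id helper that never appears in this collection. Based on the SELLER_INFO handling in the example above, it plausibly looks something like this sketch (keying the cache by seller url is an assumption):

    # Hypothetical sketch -- the real get_seller_id is not included in these examples.
    def get_seller_id(self, seller_url):
        # Serve the id from the SELLER_INFO cache when the seller is already known.
        if seller_url in SELLER_INFO:
            return SELLER_INFO.get(seller_url)
        # Otherwise scrape and write the seller, which is expected to populate the cache.
        self.get_seller_data(seller_url)
        return SELLER_INFO.get(seller_url)  # None if the seller page could not be parsed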
Example #27
    def get_artwork_data_slave(self, url):
        soup = TheMiner.fetch_page(url, ghost=True)
        if soup is not None:

            # Field initiation ::

            artwork = None
            price = None
            type_ = None
            dimensions = None
            frame = None
            authenticity = None
            about = None
            artist_id = None
            image_loc = None
            year = None
            support = None
            signature = None
            # Material to be added to technique
            technique = ""


            seller_id = None
            artist = None
            # Medium must always have "Painting" or "Sculpture" (RULE :: 2)
            if "/painting/" in str(url):
                medium = "Painting"  # (painting or sculpture)
            elif "/sculpture/" in str(url):
                medium = "Sculpture"
            else:
                # So that url leaks don't break the code.
                medium = None

            # Seller_id (selector follows the fuller variant of this scraper in Example #29)
            seller_url = str(soup.find('div', class_='product-artist').a.get('href')).strip()
            # We'll let it crash at seller_url not found because that is the way of the world.
            seller_id = self.get_seller_id(seller_url)

            # Artist_id (on this site the artist page doubles as the seller page, as in Example #29)
            artist_url = seller_url
            artist_id = self.get_artist_id(artist_url)

            # Continue fetching data only if seller_id, artist_id and medium are found. (RULE :: 3, 4)
            if seller_id is not None and artist_id is not None and medium is not None:



                # Price (selector follows the sibling variant of this scraper in Example #26)
                try:
                    price = str(soup.find('div', class_='product-price').find('div', class_='p-price-container').text).strip()
                    temp = ""
                    for i in price:
                        if i.isdigit():
                            temp += i
                        if i == ".":
                            temp += i
                    price = float(temp)
                    # Price
                    # print(price)
                except AttributeError:
                    price = None
                except ValueError:
                    price = None

                # RULE : 5
                if price is not None:

                    # Find artist, artwork, year, type_, dimensions, support, frame, signature, authenticity,
                    # about, image_loc(actual url of the image), and technique

                    # We want the code to break if either the artist's name or the artwork's name is not found.
                    # Artist (selectors follow the sibling variant of this scraper in Example #26)
                    artist = str(soup.h1.find('div', class_='product-artist').a.text).strip()
                    print(artist)

                    # Artwork
                    artwork = str(soup.h1.find('div', class_='product-name').text).strip()
                    print(artwork)

                    try:
                        about = str(soup.find('div', class_='product-details_desc').find('div', class_='desc text-1').text).strip()
                    except AttributeError:
                        about = None

                    artwork_bundle = {"artwork_title": artwork, "artist_name": artist, "year": year, "price": price,
                                      "Medium": medium, "Type": type_, "Dimensions": dimensions, "Support": support,
                                      "Frame": frame, "Signature": signature, "Authenticity": authenticity,
                                      "About": about, "platform": self.website.platform, "image_addr": image_loc,
                                      "seller_id": seller_id, "artist_id": artist_id, "url": url, "technique": technique}

                    TheAuthour.write_artwork_price_image(**artwork_bundle)
                else:
                    print(f"Skipping {url}\n PRICE : {price}")
            else:
                print(f"Skipping : {url}\nSeller_id = {seller_id}, Artist_id = {artist_id}, medium = {medium}")
Example #28
    def get_art_data_core(self, url):
        platform = self.website.platform
        artist_name = None
        artwork_title = None
        year = None
        price = None
        Dimensions = None
        Medium = None
        Type = None
        Support = None
        Frame = None
        Signature = None
        Authenticity = None
        About = None
        image_addr = None
        seller_id = None

        soup = TheMiner.fetch_page(url)
        if soup is not None:
            # Data to be picked here.
            # Artist's name, artwork's name, year, Artwork description, Price, Dimensions, Medium(Sculpture/Painting)
            # Type (Copies or Unique), Frame, Support, Authenticity, Website, Image (12)

            seller_id_trigger, seller_bundle = self.seller_info(soup)
            # seller_id_trigger can be 0, 1, or a real id (a real id comes with bundle = None).
            # A trigger of 0 comes with some data in the bundle.
            # A trigger of 1 comes with no data in the bundle.
            if seller_bundle is None:
                seller_id = seller_id_trigger

            # THE FOLLOWING BLOCK OF CODE NEEDS TO BE CONSISTENT ACROSS ALL THE WEBSITE MODULES.
            # Get seller bundle
            elif seller_id_trigger == 0:
                seller_ds = SellerData(*seller_bundle)
                s_bundle = seller_ds.seller_bundle()
                # Write data to table "sellers"
                s_agent = db.Sellers()
                s_agent.create_table_sellers()
                seller_id = s_agent.insert_data_sellers(*s_bundle)
                # Writing the seller_info for quick use and reduce the number of clicks
                seller_name = seller_bundle[0]
                location = seller_bundle[1]
                SELLER_INFO["_".join([seller_name, location])] = seller_id

            else:
                seller_id = seller_id_trigger

            try:
                A = soup.find('section', id='informations')
                B = A.find('div', class_='relative')

                try:
                    ## ARTIST'S NAME
                    artist_name = B.find('span', class_='primary-title').text.strip()
                    # print(artist_name)
                except:
                    artist_name = None
                try:
                    ## ARTWORK'S NAME
                    C = B.find('span', class_='secondary-title').text.strip()
                    artwork_ = C.split(',')
                    artwork_title = ""
                    for a in range(len(artwork_)-1):
                        if a == 0:
                            artwork_title = artwork_[a]
                            continue
                        artwork_title = artwork_title + ", " + artwork_[a].strip()
                    # print(artwork_title)

                    # ARTWORK YEAR
                    year = C.split(',')[-1].strip()
                    # print(year)
                except:
                    artwork_title = None
                    year = None
                try:
                    # PRICE
                    price = A.find('p', class_='media-price price').text.strip()
                    number = ''
                    for p in price:
                        if p == '-':
                            break
                        if p.isdigit():
                            number += str(p)
                    price = int(number)
                    # print(price)
                except:
                    price = None

                try:
                    # Image url
                    B = A.find('div', id='img-container')
                    image_addr = B.find('img', id='img_original')['data-src']
                    # print(image_addr)
                except:
                    image_addr = None
            except:
                artist_name = None
                artwork_title = None
                year = None
                price = None
                image_addr = None

            try:
                D = soup.find('div', id='tabs-description').ul
                # Contains:: image, dimensions, medium, type, Frame, Support, authenticity, signature
                E = D.find_all('li')
                Dimensions = None
                Medium = None
                Type = None
                Support = None
                Frame = None
                Signature = None
                Authenticity = None
                About = None

                for e in E:
                    a = e.text
                    # Dimensions
                    if 'Dimensions' in a and 'About the artwork' not in a and 'Support' not in a:
                        Dimensions = e.find('p', class_='pull-right').strong.text.strip() + ' (Height x Width x Depth)'
                        dim = True
                        # print(Dimensions)
                        continue

                    # Medium (Sculpture/Painting)
                    if 'Medium' in a and 'About the artwork' not in a:
                        Medium = e.find('p', class_='pull-right').a.text.strip()
                        # print(Medium)
                        continue

                    # Type
                    if 'Type' in a and 'About the artwork' not in a:
                        Type = e.find('p', class_='pull-right text-right').text.strip().split('  ')[0]
                        # print(Type)
                        continue

                    # Support (base)
                    if 'Support' in a and 'About the artwork' not in a:
                        try:
                            f = e.find('p', class_='pull-right text-right').text.strip().split('  ')
                            Support = f[0] + '. ' + f[1].strip('\n')
                            f = e.find('p', class_='pull-right text-right').strong.text.strip().strip('\n')
                            Support += f
                        except IndexError:
                            Support = e.find('p', class_='pull-right text-right').text.strip()
                        # print(Support)
                        continue

                    # Framing
                    if 'Framing' in a and 'About the artwork' not in a:
                        Frame = e.find('p', class_='pull-right').text.strip()
                        # print(Frame)
                        continue

                    # Signature
                    if 'Signature' in a and 'About the artwork' not in a:
                        Signature = e.find('p', class_='pull-right').text.strip()
                        # print(Signature)
                        continue

                    # Authenticity
                    if 'Authenticity' in a and 'About the artwork' not in a:
                        Authenticity = e.find('p', class_='pull-right text-right').text.strip()
                        # print(Authenticity)
                        continue

                    # Artwork Description
                    if 'About the artwork' in a:
                        About = e.find('p', class_="marg-bot-10")
                        if About is not None:
                            a = e.find('div', class_="description-catalog see-more text-justify").text.strip()
                            About = About.text.strip()
                            About += a
                        else:
                            About = e.find('p', class_='').text.strip()
                        continue
                        # print(About)
            except:
                # Make all the fields Null
                Dimensions = None
                Medium = None
                Type = None
                Support = None
                Frame = None
                Signature = None
                Authenticity = None
                About = None

            result = {"artwork_title": artwork_title, "artist_name": artist_name,  "year": year, "price": price,
                      "Dimensions": Dimensions, "Medium": Medium, "Type": Type, "Support": Support, "Frame": Frame,
                      "Signature": Signature, "Authenticity": Authenticity, "About": About, 'platform': platform,
                      "image_addr": image_addr, "seller_id": seller_id}

            artwork_item = ArtworkData(**result)
            # Image downloading is deferred: after every 100 or so instances the
            # image pool data [image_url and artwork_id] is written to the db, and the
            # whole pool of images is downloaded at the end of the execution.
            # The download function picks batches of 100 images; it lives with TheMiner in the
            # dataStructures module and is called by ArtworkData (in dataStructures).
            # Do not thread anything with the download function, as it is itself launched on a thread.
            art_bund = artwork_item.artwork_bundle()

            # WRITING ARTWORK
            dbartwork_agent = db.Artwork()
            dbartwork_agent.create_table_artwork()
            artwork_id = dbartwork_agent.insert_data_artwork(*art_bund)

            # Writing image-info
            # image_addr = result[13]
            image_bundle = artwork_item.image_bundle(artwork_id)
            dbimage_agent = db.Images()
            dbimage_agent.create_table_images()
            # dbimage_agent.insert_data_images(image_addr, artwork_id)
            dbimage_agent.insert_data_images(*image_bundle)

            # Price bundle can only be created once the artwork is written in the db
            price_bund = artwork_item.price_bundle(artwork_id)

            # WRITING PRICES
            dbprice_agent = db.Price()
            dbprice_agent.create_table_prices()
            dbprice_agent.insert_data_prices(*price_bund)
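A minimal sketch of the deferred image download described in the comments of the example above: (image_url, artwork_id) pairs are pooled, flushed to the db in batches of roughly 100, and every image is fetched in one unthreaded pass at the end of the run. The names below (ImagePool, flush, download_all) are illustrative, not the ones used by TheMiner.

import os
import urllib.request


class ImagePool:
    """Collects (image_url, artwork_id) pairs and downloads them in one pass at the end."""

    def __init__(self, batch_size=100):
        self.batch_size = batch_size
        self.pool = []      # pending (image_url, artwork_id) pairs
        self.flushed = []   # pairs already handed to the db layer

    def add(self, image_url, artwork_id):
        self.pool.append((image_url, artwork_id))
        if len(self.pool) >= self.batch_size:
            self.flush()

    def flush(self):
        # Stand-in for writing the batch to the images table.
        self.flushed.extend(self.pool)
        self.pool.clear()

    def download_all(self, folder="images"):
        # Single, unthreaded pass once data collection is finished.
        self.flush()
        os.makedirs(folder, exist_ok=True)
        for image_url, artwork_id in self.flushed:
            target = os.path.join(folder, f"{artwork_id}.jpg")
            try:
                urllib.request.urlretrieve(image_url, target)
            except OSError:
                print(f"Could not download {image_url}")

In the scraper itself the flush step corresponds to db.Images.insert_data_images, and the downloader lives with TheMiner in dataStructures.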
Example #29
0
    def get_artwork_data_slave(self, url):
        soup = TheMiner.fetch_page(url)
        if soup is not None:

            # Field initialisation: artwork title, artist, price and seller_id are picked from the page;
            # medium, type, dimensions, frame, authenticity, about, year, support, signature,
            # artist_id and image_loc default to None.

            seller_id = None
            artist = None
            artwork = None
            price = None

            # Material to be added to technique
            technique = ""

            # Medium must always have "Painting" or "Sculpture" (RULE :: 2)
            if "/painting/" in str(url):
                medium = "Painting"  # (painting or sculpture)
            elif "/sculpture/" in str(url):
                medium = "Sculpture"
            else:
                # So that url leaks don't break the code.
                medium = None

            type_ = None
            dimensions = None
            frame = None
            authenticity = None
            about = None

            artist_id = None
            image_loc = None
            year = None
            support = None
            signature = None

            seller_url = str(
                soup.find('div',
                          class_='product-artist').a.get('href')).strip()
            # We want the code to break if this entry is not found so that we can fix it.
            # THE PAGE MUST HAVE A SELLER.

            # Seller_id
            seller_id = self.get_seller_id(seller_url)

            # Artist_id
            artist_url = seller_url
            artist_id = self.get_artist_id(artist_url)

            # Continue fetching data only if seller_id, artist_id and medium are found. (RULE :: 3, 4)
            if seller_id is not None and artist_id is not None and medium is not None:
                A = soup.h1
                B = A.find('div', class_='product-artist')
                artist = str(B.a.text).strip()
                # Artist
                # print(artist)

                artwork = str(A.find('div',
                                     class_='product-name').text).strip()
                # Artwork
                # print(artwork)

                price = str(
                    soup.find('div', class_='product-price').find(
                        'div', class_='p-price-container').text).strip()
                temp = ""
                for i in price:
                    if i.isdigit():
                        temp += i
                    if i == ".":
                        temp += i
                price = float(temp)
                # Price
                # print(price)

                product_details_desc = soup.find('div',
                                                 class_='product-details_desc')
                product_details = product_details_desc.find_all(
                    'div', class_='tech-item')

                for detail in product_details:
                    label = str(detail.find(
                        'div', class_='tech-label').text).strip().upper()
                    value = str(detail.find('div',
                                            class_='tech-value').text).strip()
                    # print(label)
                    # print(value)
                    # For KAZoART, both the Technique and the Material labels feed the technique field;
                    # the Painting/Sculpture medium was already set from the url above.
                    if label == 'TECHNIQUE':
                        technique += " " + value
                        technique = technique.strip()
                    elif label == 'TYPE':
                        type_ = value
                    elif label == 'MATERIAL':
                        technique += " " + value
                        technique = technique.strip()
                    elif label == 'DIMENSIONS':
                        dimensions = value
                    elif label == 'FRAMING':
                        frame = value
                    elif label == 'QUALITY GUARANTEE':
                        authenticity = value

                    # Labels not handled above are ignored.

                try:
                    about = str(
                        product_details_desc.find(
                            'div', class_='desc text-1').text).strip()
                except AttributeError:
                    about = None

                image_loc = soup.find('div', class_='product-left').find(
                    'div', class_='img-wrapper').img.get('src')
                # print(image_loc)

                # self, artwork_title=None, artist_name=None, year=None, price=None, Dimensions=None, Medium=None,
                #     Type=None, Support=None, Frame=None, Signature=None, Authenticity=None, About=None,
                #      platform=None, image_addr=None, seller_id=None, artist_id=None)

                artwork_bundle = {
                    "artwork_title": artwork,
                    "artist_name": artist,
                    "year": year,
                    "price": price,
                    "Medium": medium,
                    "Type": type_,
                    "Dimensions": dimensions,
                    "Support": support,
                    "Frame": frame,
                    "Signature": signature,
                    "Authenticity": authenticity,
                    "About": about,
                    "platform": self.website.platform,
                    "image_addr": image_loc,
                    "seller_id": seller_id,
                    "artist_id": artist_id,
                    "url": url,
                    "technique": technique
                }

                TheAuthour.write_artwork_price_image(**artwork_bundle)
                # self.write_artwork_data(**artwork_bundle)
            else:
                print(
                    f"Skipping : {url}\nSeller_id = {seller_id}, Artist_id = {artist_id}, medium = {medium}"
                )
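The price parsing in get_artwork_data_slave above reduces the price text to a number by keeping only digits and the decimal point before calling float. The same idea as a small regex-based sketch (parse_price is an illustrative name, not a helper from the scraper); returning None instead of raising keeps a Rule-5 style "if price is not None" check usable when no digits survive.

import re


def parse_price(raw):
    """Keep digits and the decimal point from a price string, e.g. '1 250.00 EUR' -> 1250.0."""
    cleaned = "".join(re.findall(r"[0-9.]", raw))
    if not cleaned:
        return None
    try:
        return float(cleaned)
    except ValueError:
        # More than one decimal point, or otherwise unparsable.
        return None


print(parse_price("1 250.00 EUR"))   # 1250.0
print(parse_price("SOLD"))           # None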
Example #30
0
    def get_artwork_data_slave(self, url):
        soup = TheMiner.fetch_page(url)
        if soup is not None:

            # Field initialisation: artwork title, artist, price, seller_id, medium, type,
            # dimensions, frame, authenticity, about, year, support, signature,
            # artist_id and image_loc all default to None.

            seller_id = None
            artist = None
            artwork = None
            price = None
            medium = None  # (painting or sculpture)
            technique = ""  # Material and style
            type_ = None
            dimensions = None
            frame = None
            authenticity = None
            about = None
            artist_id = None
            image_loc = None
            year = None
            support = None
            signature = None

            try:
                # PRICE
                A = soup.find('section', id='informations')
                price = str(A.find('p',
                                   class_='media-price price').text).strip()
                number = ''
                for p in price:
                    if p == '-':
                        break
                    if p.isdigit() or p == ".":
                        number += p

                price = float(number)
                # print(price)
            except AttributeError:
                pass
            except TypeError:
                pass

            # Rule : 5
            if price is not None:

                # Seller_id
                try:
                    seller_url = soup.find(
                        'div', id='top-seller').find('a').get('href')
                    if 'galeries-d-art' in str(seller_url):
                        seller_url = re.sub('galeries-d-art', 'art-galleries',
                                            seller_url)

                    # If seller_url is found.
                    seller_id = self.get_seller_id(seller_url)
                except AttributeError:
                    # seller_id = None
                    # There are pages where the seller has no other page. Then we make the url ourselves.

                    seller_url = soup.find('div', id='top-seller').find(
                        'p', class_='highlight-title').text
                    seller_url = str(seller_url).strip()

                    if seller_url in SELLER_INFO:
                        seller_id = SELLER_INFO[seller_url]
                    else:
                        location = soup.find('div', id='top-seller').find(
                            'p', class_='subtitle').text.strip().split(',')
                        location = str(location[-1]).strip()

                        seller_name = seller_url

                        bundle = [
                            seller_url, self.website.platform, seller_name,
                            location, None
                        ]

                        # We write the seller info directly and fetch the seller_id
                        TheAuthour.write_seller(*bundle)
                        seller_id = SELLER_INFO[seller_url]

                # Artist_id
                try:
                    artist_url = soup.find('section', id='informations').find(
                        'div', class_='relative').a.get('href')
                    if "oeuvres-d-art-contemporain" in artist_url:
                        re.sub("oeuvres-d-art-contemporain",
                               "contemporary-artworks", artist_url)
                    artist_id = self.get_artist_id(artist_url)
                except AttributeError:
                    artist_id = None
                    # Flag pages whose artist link is missing, then pause so the message is noticed.
                    print(f"\n\nARTIST URL NOT FOUND : {url}\n\n")
                    time.sleep(50)

                # Medium must always have "Painting" or "Sculpture" (RULE :: 2)
                la = str(url).split('/')
                if 'painting' in la:
                    medium = "Painting "  # (painting or sculpture)
                elif 'sculpture' in la:
                    medium = "Sculpture"
                else:
                    # So that url leaks don't break the code.
                    medium = None

                # IF either the seller id or artist_id are missing, escape the rest. (Rule : 3)
                # If medium is neither Paintings, not Sculptures. We don't fetch data. ( Rule : 2)
                if seller_id is not None and artist_id is not None and medium is not None:
                    # ______________________________MAIN DATA FETCHING________________________
                    A = soup.find('section', id='informations')
                    B = A.find('div', class_='relative')

                    # ARTIST'S NAME
                    artist = B.find('span',
                                    class_='primary-title').text.strip()
                    # print(artist)

                    # ARTWORK'S NAME
                    C = B.find('span', class_='secondary-title').text.strip()
                    artwork_ = C.split(',')
                    # Everything before the trailing ", year" segment forms the title.
                    artwork = ", ".join(part.strip() for part in artwork_[:-1])
                    # print(artwork)

                    try:
                        # ARTWORK YEAR
                        year = C.split(',')[-1].strip()
                        # print(year)
                    except IndexError:
                        pass
                        # year = None

                    try:
                        # Image url
                        B = A.find('div', id='img-container')
                        image_loc = B.find('img',
                                           id='img_original')['data-src']
                        # print(image_loc)
                    except AttributeError:
                        pass

                    # Contains:: image, dimensions, medium, type, Frame, Support, authenticity, signature
                    try:
                        D = soup.find('div', id='tabs-description').ul
                        E = D.find_all('li')

                        for e in E:
                            a = e.text
                            # Dimensions
                            if 'Dimensions' in a and 'About the artwork' not in a and 'Support' not in a:
                                dimensions = e.find(
                                    'p',
                                    class_='pull-right').strong.text.strip()
                                dim = True
                                # print(dimensions)
                                continue

                            # Medium (Sculpture/Painting) and Technique
                            if 'Medium' in a and 'About the artwork' not in a:
                                technique = e.find(
                                    'p', class_='pull-right').text.split("   ")
                                # print(technique)
                                temp = ""
                                for t in technique:
                                    if t != "":
                                        temp += t.strip()
                                        temp += " "
                                # medium = medium[0]
                                # technique = medium[1]
                                technique = temp
                                # print(technique)
                                continue

                            # Type
                            if 'Type' in a and 'About the artwork' not in a:
                                type_ = e.find('p',
                                               class_='pull-right text-right'
                                               ).text.strip().split('  ')[0]
                                # print(type_)
                                continue

                            # Support (base)
                            if 'Support' in a and 'About the artwork' not in a:
                                try:
                                    f = e.find('p',
                                               class_='pull-right text-right'
                                               ).text.strip().split('  ')
                                    support = f[0] + '. ' + f[1].strip('\n')
                                    f = e.find(
                                        'p', class_='pull-right text-right'
                                    ).strong.text.strip().strip('\n')
                                    support += f
                                except IndexError:
                                    support = e.find(
                                        'p', class_='pull-right text-right'
                                    ).text.strip()
                                # print(support)
                                continue

                            # Framing
                            if 'Framing' in a and 'About the artwork' not in a:
                                frame = e.find(
                                    'p', class_='pull-right').text.strip()
                                # print(frame)
                                continue

                            # Signature
                            if 'Signature' in a and 'About the artwork' not in a:
                                signature = e.find(
                                    'p', class_='pull-right').text.strip()
                                # print(signature)
                                continue

                            # Authenticity
                            if 'Authenticity' in a and 'About the artwork' not in a:
                                authenticity = e.find(
                                    'p',
                                    class_='pull-right text-right').text.strip(
                                    )
                                # print(authenticity)
                                continue

                            # Artwork Description
                            if 'About the artwork' in a:
                                about = e.find('p', class_="marg-bot-10")
                                if about is not None:
                                    extra = e.find(
                                        'div',
                                        class_="description-catalog see-more text-justify"
                                    ).text.strip()
                                    about = about.text.strip()
                                    about += extra
                                else:
                                    about = e.find('p', class_='').text.strip()
                                continue
                                # print(about)
                    except AttributeError:
                        pass

                        # self, artwork_title=None, artist_name=None, year=None, price=None, Dimensions=None, Medium=None,
                        #     Type=None, Support=None, Frame=None, Signature=None, Authenticity=None, About=None,
                        #      platform=None, image_addr=None, seller_id=None, artist_id=None)

                    artwork_bundle = {
                        "artwork_title": artwork,
                        "artist_name": artist,
                        "year": year,
                        "price": price,
                        "Medium": medium,
                        "Type": type_,
                        "Dimensions": dimensions,
                        "Support": support,
                        "Frame": frame,
                        "Signature": signature,
                        "Authenticity": authenticity,
                        "About": about,
                        "platform": self.website.platform,
                        "image_addr": image_loc,
                        "seller_id": seller_id,
                        "artist_id": artist_id,
                        "url": url,
                        "technique": technique
                    }
                    # print(artwork_bundle)
                    TheAuthour.write_artwork_price_image(**artwork_bundle)
                else:
                    print(
                        f"SELLER ID :: {seller_id},\nARTIST ID :: {artist_id}")
            else:
                # If the price is not available, we skip the entire process.
                print(f"PRICE NOT FOUND : {price} at {url}")
        else:
            print(f"\n\n\n\n\nURL DIDN'T RETURN : {url}\n\n\n\n\n")