Python TheAuthour Examples, packets.dataStructures.TheAuthour Python Examples

Example #1

0

Show file

File: kazoart.py Project: SVwrite/Web_Scrapers

    def get_artist_data(self, soup, url):
        """Extract the artist's details from an artist page and record them.

        Called by self.get_artwork_listings_slave(). Picks name, country and
        about; the birth year is not read from this page and is stored as None.

        Args:
            soup: Parsed artist page exposing the BeautifulSoup ``find`` API.
            url: Artist page URL; used as the key into KEY_INFO.

        Raises:
            AttributeError: If the ``artist-resume`` block or its ``<h1>`` is
                missing. This is deliberate: a missing name means the page
                layout has changed and the scraper needs to be fixed.
        """
        artist_resume = soup.find('div', class_='artist-resume').find(
            'div', class_='artist-resume_text')
        # Intentionally unguarded, see "Raises" above.
        name = artist_resume.h1.text.strip()
        print(name)

        try:
            # First line of the location paragraph, last comma-separated part.
            location_lines = artist_resume.find(
                'p', class_='location').text.strip().split('\n')
            country = location_lines[0].split(',')[-1].strip()
            print(country)
        except AttributeError:
            country = None

        # About is either some text or None; a missing block must not abort
        # the whole artist record.
        try:
            about = soup.find('div', id='about').text.strip()
        except AttributeError:
            about = None

        # pack = [name, born, country, about]
        artist_data_pack = [name, None, country, about]
        KEY_INFO[url] = db.Artist.key_maker(artist_data_pack)
        TheAuthour.write_artist(*artist_data_pack)

Example #2

0

Show file

File: singulart.py Project: SVwrite/Web_Scrapers

    def get_seller_id(self, seller_url) -> int:
        """Return the seller_id registered for seller_url, or None.

        Looks the url up in SELLER_INFO. When it is absent, the seller page is
        scraped via self.get_seller_data(); if that still leaves no entry, a
        placeholder 'BAREBONES' seller is written so that an id can be looked
        up. None is returned (after printing an error) when no id is found,
        which stops get_artwork_data_slave from gathering data past the
        rule 3 check.
        """
        if seller_url is None:
            print("FATAL ERROR :: Seller_id not found.")
            return None

        # Seller already known.
        if seller_url in SELLER_INFO:
            return SELLER_INFO.get(seller_url)

        # No entry yet: scrape the seller page, then look again.
        self.get_seller_data(seller_url)
        if seller_url not in SELLER_INFO:
            # Kazoart style bundle: [seller_url, platform, name, location, website]
            placeholder = (seller_url, self.website.platform, 'BAREBONES',
                           None, seller_url)
            TheAuthour.write_seller(*placeholder)

        if seller_url not in SELLER_INFO:
            print("FATAL ERROR :: Seller_id not found.")
            return None
        return SELLER_INFO.get(seller_url)

Example #3

0

Show file

File: artsper_new.py Project: SVwrite/Web_Scrapers

    def get_seller_data(self, url):
        """Scrape a seller page and write the seller record.

        Caller :: get_artwork_data_slave. Does nothing when the page cannot be
        fetched. Location and website are optional and stored as None when
        absent; a missing seller name is allowed to raise.
        """
        visited.discard(url)
        page = TheMiner.fetch_page(url)
        if page is None:
            return

        seller_box = page.find('div', id='top-seller')
        # Unguarded on purpose: the code should break if the name is missing.
        seller_name = seller_box.h1.text.strip()

        try:
            subtitle = seller_box.find('p', class_="subtitle")
            # Keep only the last comma-separated part of the subtitle.
            location = subtitle.text.strip().split(',')[-1].strip()
        except AttributeError:
            location = None

        try:
            link = page.find('ul', id="websites").a
            website = str(link['href']).strip()
        except (AttributeError, TypeError):
            website = None

        TheAuthour.write_seller(
            url, self.website.platform, seller_name, location, website)

Example #4

0

Show file

File: kazoart.py Project: SVwrite/Web_Scrapers

    def get_seller_id(self, seller_url) -> int:
        """Return the seller_id registered for seller_url, or None.

        When the url has no entry in SELLER_INFO, a fixed 'KAZoART' seller
        bundle is written first and the lookup is repeated. An error line is
        printed and None returned when no id can be found.
        """
        if seller_url is None:
            print("FATAL ERROR :: Seller_id not found.")
            return None

        if seller_url not in SELLER_INFO:
            # bundle = [seller_url, platform, seller name, location, website]
            TheAuthour.write_seller(seller_url, self.website.platform,
                                    'KAZoART', None, seller_url)
            if seller_url not in SELLER_INFO:
                print("FATAL ERROR :: Seller_id not found.")
                return None

        return SELLER_INFO.get(seller_url)

Example #5

0

Show file

File: bare_bones.py Project: SVwrite/Web_Scrapers

    def get_artist_data(self, soup, url):
        """Template for scraping an artist page and recording the artist.

        Called by self.get_artwork_listings_slave(). This is a scaffold: the
        extraction of name, country, born and about is marked only by
        placeholder comments, so none of those values is assigned before it is
        printed.

        NOTE(review): ``name`` is never assigned in this method, so
        ``print(name)`` only works if a module-level ``name`` exists (confirm).
        ``country``, ``born`` and ``about`` are assigned only in the ``except``
        branches, which makes them locals; printing them before assignment
        raises UnboundLocalError, which ``except AttributeError`` does not
        catch. The placeholders must be filled in before this can run.

        Args:
            soup: Presumably the parsed artist page; not referenced by the
                current body.
            url: Key under which the artist key is stored in KEY_INFO.
        """
        # Called by self.get_artwork_listings_slave()
        # Pick name, born, country, about

        # Name : pick the artist's name here (placeholder, nothing assigned yet).
        print(name)
        # If an error occurs here, it's because the page layout has changed and the code needs to be fixed.

        if name is not None:
            try:
                # Pick the artist's country here (placeholder).
                print(country)
            except AttributeError:
                # Country could not be read from the page.
                country = None

            try:
                # Pick the birth year here (placeholder).
                print(born)
            except AttributeError:
                # Birth year could not be read from the page.
                born = None

            try:
                # Pick the artist's description here (placeholder).
                print(about)
            except AttributeError:
                # Description could not be read from the page.
                about = None

            artist_data_pack = [name, born, country, about]
            # pack = [name, born, country, about]
            # Updating KEY_INFO dictionary.
            KEY_INFO[url] = db.Artist.key_maker(artist_data_pack)
            # Updating the dB with artist listings.
            TheAuthour.write_artist(*artist_data_pack)

Example #6

0

Show file

File: singulart.py Project: SVwrite/Web_Scrapers

    def get_seller_data(self, url):
        """Template for scraping a seller page and writing the seller record.

        Removes ``url`` from ``visited``, fetches the page, and, when a page is
        returned, writes ``[url, platform, seller_name, location, website]``
        through TheAuthour.write_seller.

        NOTE(review): the extraction steps are placeholders. ``seller_name`` is
        never assigned here, so ``print(seller_name)`` only works if a
        module-level ``seller_name`` exists (confirm). ``location`` and
        ``website`` are assigned only in ``except`` branches, which makes them
        locals; printing them first raises UnboundLocalError, which the
        handlers below do not catch.

        Args:
            url: Seller page URL; also the first element of the written bundle.
        """
        # Caller :: get_artwork_data_slave and get_seller_id
        visited.discard(url)
        soup = TheMiner.fetch_page(url)

        if soup is not None:

            # Seller's name (placeholder, nothing assigned yet).
            print(seller_name)
            # Code will break if the seller's name is not found.

            # Location (placeholder); falls back to None on AttributeError.
            try:
                print(location)
            except AttributeError:
                location = None

            # Website (placeholder); falls back to None on AttributeError/TypeError.
            try:
                print(website)
            except AttributeError:
                website = None
            except TypeError:
                website = None

            # bundle = [seller_url, platform, seller's name, location, website]
            bundle = [url, self.website.platform, seller_name, location, website]
            print(bundle)
            TheAuthour.write_seller(*bundle)

Example #7

0

Show file

    def key_maker(artist_url):
        """Scrape an artist page with headless Firefox and return the artist key.

        Loads ``artist_url`` in a headless browser, extracts the artist's name,
        country and description, stores the generated key in KEY_INFO and
        writes the artist to the database.

        Args:
            artist_url: URL of the artist page; also the KEY_INFO key.

        Returns:
            The key produced by db.Artist.key_maker, or None when no artist
            name could be found on the page.
        """
        options = Options()
        options.headless = True
        driver = webdriver.Firefox(options=options)
        try:
            visited.discard(artist_url)
            # The page has to be loaded before page_source holds anything
            # useful, and the second BeautifulSoup argument is the parser
            # name, not the URL.
            driver.get(artist_url)
            soup = BeautifulSoup(driver.page_source, 'html.parser')
        finally:
            # The parsed soup is independent of the browser, so always
            # release the driver here, including on errors.
            driver.quit()

        # Name and country are the first and second 'font_2' headings.
        n_c = soup.find_all('h2', class_='font_2')

        # Artist's name
        try:
            name = n_c[0].text.strip()
        except IndexError:
            # Layout probably changed; dump what was found for debugging.
            print(n_c)
            name = None

        if name is None:
            return None

        # Country
        try:
            country = n_c[1].text.strip()
        except (IndexError, AttributeError):
            country = None

        # About: every 'font_8' paragraph, each followed by a single space.
        try:
            paragraphs = soup.find_all('p', class_='font_8')
            about = "".join(p.text.strip() + " " for p in paragraphs)
        except (AttributeError, TypeError):
            about = None

        # pack = [name, born, country, about]
        artist_data_pack = [name, None, country, about]
        # Updating KEY_INFO dictionary.
        KEY_INFO[artist_url] = db.Artist.key_maker(artist_data_pack)
        key = KEY_INFO.get(artist_url)
        # Updating the dB with artist listings.
        TheAuthour.write_artist(*artist_data_pack)
        return key

Example #8

0

Show file

File: artsper_new.py Project: SVwrite/Web_Scrapers

    def get_artist_data(self, soup, url):
        """Extract name, born, country and about from an artist page.

        Called by self.get_artwork_listings_slave(). The result is stored in
        KEY_INFO under ``url`` and written to the database.

        Args:
            soup: Parsed artist page exposing the BeautifulSoup ``find`` API.
            url: Artist page URL; used as the key into KEY_INFO.

        Raises:
            AttributeError: If the biography block or its ``<h1>`` is missing.
                Deliberate: the code should break if the name goes missing.
        """
        biography = soup.find('div', id='biography')
        # Artist's name (unguarded on purpose, see "Raises").
        name = biography.h1.text.strip()

        # Sub-title row holding both the birthday and the country; may be None.
        sub_title = biography.find('div', class_='sub-title col-sm-9 col-xs-12')

        try:
            # Born: keep only the digits of the birthday text.
            birthday = sub_title.find('span', class_='birthday-date').text
            born = int("".join(ch for ch in birthday if ch.isdigit()))
        except (AttributeError, ValueError):
            # ValueError: the span exists but holds no digits at all.
            born = None

        try:
            # Country: text of the first span in the sub-title row.
            country = sub_title.span.text.strip()
        except AttributeError:
            country = None

        try:
            # About: split on double spaces, drop the final chunk, and put
            # every remaining chunk on its own line.
            raw_about = biography.find(
                'div', class_='col-sm-9 col-xs-12 biography').text.strip()
            chunks = raw_about.split("  ")
            about = "".join(
                "\n" + chunk.strip() for chunk in chunks[:-1]).strip()
        except AttributeError:
            about = None

        # pack = [name, born, country, about]
        artist_data_pack = [name, born, country, about]
        KEY_INFO[url] = db.Artist.key_maker(artist_data_pack)
        TheAuthour.write_artist(*artist_data_pack)

Example #9

0

Show file

File: singulart.py Project: SVwrite/Web_Scrapers

    def get_artist_data(self, soup, url):
        """Extract name, born, country and about from an artist page.

        Nothing is recorded when the artist's name cannot be found. Otherwise
        the key is stored in KEY_INFO under ``url`` and the artist is written
        to the database; born, country and about fall back to None
        individually when missing.

        Args:
            soup: Parsed artist page (already fetched by the caller).
            url: Artist page URL; used as the key into KEY_INFO.
        """
        try:
            name = soup.find('div', class_='artist-intro').find('h1').text
            name = str(name).strip()
        except AttributeError:
            name = None

        if name is None:
            return

        # Born
        try:
            born_text = soup.find('p', class_='born').text.strip()
            born = int("".join(ch for ch in born_text if ch.isdigit()))
            if born > 3000:
                # Several numbers ran together (e.g. two years); keep the
                # first four digits as the year, and keep it an int.
                born = int(str(born)[:4])
        except (AttributeError, ValueError):
            born = None

        # Country: last '|'-separated part of the intro sub-heading.
        try:
            intro = soup.find('div', class_="artist-intro")
            parts = intro.find('div', class_='h2').text.strip().split("|")
            country = str(parts[-1]).strip()
        except AttributeError:
            country = None

        # About
        try:
            bio = soup.find('section', class_='artist-bio')
            about = bio.find('div', class_='resume').text.strip()
        except AttributeError:
            about = None

        # pack = [name, born, country, about]
        artist_data_pack = [name, born, country, about]
        # Updating KEY_INFO dictionary.
        KEY_INFO[url] = db.Artist.key_maker(artist_data_pack)
        # Updating the dB with artist listings.
        TheAuthour.write_artist(*artist_data_pack)

Example #10

0

Show file

File: artsy.py Project: SVwrite/Web_Scrapers

    def get_seller_id(self, seller_url) -> int:
        """Return the seller_id registered for seller_url, or None.

        Looks the url up in SELLER_INFO. When it is absent, the seller page is
        scraped via self.get_seller_data(); if that still leaves no entry
        (e.g. seller_url is not really a url), a bundle using seller_url as
        the seller's name is written so that an id can be looked up. None is
        returned (after printing an error) when no id is found, which stops
        get_artwork_data_slave from gathering data past the rule 3 check.
        """
        if seller_url is None:
            print("FATAL ERROR :: Seller_id not found.")
            return None

        # Seller already known.
        if seller_url in SELLER_INFO:
            return SELLER_INFO.get(seller_url)

        # No entry yet: scrape the seller page, then look again.
        self.get_seller_data(seller_url)
        if seller_url not in SELLER_INFO:
            # bundle = [seller_url, platform, seller's name, location, website]
            fallback = (seller_url, self.website.platform, seller_url, None,
                        seller_url)
            TheAuthour.write_seller(*fallback)

        if seller_url not in SELLER_INFO:
            # Only reached if the program is failing unexpectedly.
            print("FATAL ERROR :: Seller_id not found.")
            return None
        return SELLER_INFO.get(seller_url)

Example #11

0

Show file

File: artsper_new.py Project: SVwrite/Web_Scrapers

    def get_artwork_data_slave(self, url):
        """Scrape a single artwork page and write the artwork record.

        The page is skipped (with a printed message) when it cannot be
        fetched, when no price is found (Rule 5), when the seller or artist
        cannot be resolved (Rule 3), or when the URL marks neither a painting
        nor a sculpture (Rule 2).

        Args:
            url: Artwork page URL.
        """
        soup = TheMiner.fetch_page(url)
        if soup is None:
            print(f"\n\n\n\n\nURL DIDN'T RETURN : {url}\n\n\n\n\n")
            return

        # Field initiation :: Artwork_title, artist, price, seller_id,
        # medium, type, dimension, frame, authenticity, about :: year,
        # support, signature, artist_id, image_loc
        seller_id = None
        price = None
        technique = ""  # Material and style
        type_ = None
        dimensions = None
        frame = None
        authenticity = None
        about = None
        artist_id = None
        image_loc = None
        year = None
        support = None
        signature = None

        # PRICE: digits and dots up to the first '-' (start of a range).
        try:
            info = soup.find('section', id='informations')
            price_text = str(
                info.find('p', class_='media-price price').text).strip()
            number = ''
            for ch in price_text:
                if ch == '-':
                    break
                if ch.isdigit() or ch == ".":
                    number += ch
            price = float(number)
        except (AttributeError, TypeError, ValueError):
            # ValueError: the price text held no parsable number. Keep price
            # as None so the page is skipped, never as a raw string.
            price = None

        # Rule : 5 -- without a price we skip the entire process.
        if price is None:
            print(f"PRICE NOT FOUND : {price} at {url}")
            return

        # Seller_id
        try:
            seller_url = soup.find(
                'div', id='top-seller').find('a').get('href')
            if 'galeries-d-art' in str(seller_url):
                seller_url = re.sub('galeries-d-art', 'art-galleries',
                                    seller_url)

            # If seller_url is found.
            seller_id = self.get_seller_id(seller_url)
        except AttributeError:
            # There are pages where the seller has no page of its own. Then
            # the seller's displayed name is used in place of a url.
            seller_url = soup.find('div', id='top-seller').find(
                'p', class_='highlight-title').text
            seller_url = str(seller_url).strip()

            if seller_url in SELLER_INFO:
                seller_id = SELLER_INFO[seller_url]
            else:
                location = soup.find('div', id='top-seller').find(
                    'p', class_='subtitle').text.strip().split(',')
                location = str(location[-1]).strip()

                seller_name = seller_url

                bundle = [
                    seller_url, self.website.platform, seller_name,
                    location, None
                ]

                # We write the seller info directly and fetch the seller_id
                TheAuthour.write_seller(*bundle)
                seller_id = SELLER_INFO[seller_url]

        # Artist_id
        try:
            artist_url = soup.find('section', id='informations').find(
                'div', class_='relative').a.get('href')
            if "oeuvres-d-art-contemporain" in artist_url:
                # re.sub returns a new string; the result must be kept.
                artist_url = re.sub("oeuvres-d-art-contemporain",
                                    "contemporary-artworks", artist_url)
            artist_id = self.get_artist_id(artist_url)
        except AttributeError:
            artist_id = None
            print("\n\n\n\n\n")
            print(url)
            print("\n\n\n\n\n")
            time.sleep(50)

        # Medium must always have "Painting" or "Sculpture" (RULE :: 2)
        url_parts = str(url).split('/')
        if 'painting' in url_parts:
            # NOTE(review): the trailing space is kept byte-for-byte because
            # it is what gets stored today; confirm whether it is intended.
            medium = "Painting "
        elif 'sculpture' in url_parts:
            medium = "Sculpture"
        else:
            # So that url leaks don't break the code.
            medium = None

        # If either the seller_id or artist_id is missing, escape the rest.
        # (Rule : 3) If medium is neither Painting nor Sculpture, we don't
        # fetch data. (Rule : 2)
        if seller_id is None or artist_id is None or medium is None:
            print(f"SELLER ID :: {seller_id},\nARTIST ID :: {artist_id}")
            return

        # ______________________________MAIN DATA FETCHING____________________
        info = soup.find('section', id='informations')
        title_box = info.find('div', class_='relative')

        # ARTIST'S NAME
        artist = title_box.find('span', class_='primary-title').text.strip()

        # ARTWORK'S NAME: everything before the last comma of the secondary
        # title; the part after the last comma is taken as the year.
        secondary = title_box.find(
            'span', class_='secondary-title').text.strip()
        title_parts = secondary.split(',')[:-1]
        if title_parts:
            artwork = title_parts[0] + "".join(
                ", " + part.strip() for part in title_parts[1:])
        else:
            artwork = ""

        # ARTWORK YEAR
        year = secondary.split(',')[-1].strip()

        try:
            # Image url
            image_loc = info.find('div', id='img-container').find(
                'img', id='img_original')['data-src']
        except (AttributeError, TypeError, KeyError):
            # Container, <img> or its data-src attribute is missing.
            pass

        # Contains:: dimensions, medium, type, Frame, Support, authenticity,
        # signature, description. Best effort: the first malformed row stops
        # the scan and the fields gathered so far are kept.
        try:
            rows = soup.find('div', id='tabs-description').ul.find_all('li')

            for row in rows:
                label = row.text
                is_description = 'About the artwork' in label

                # Dimensions
                if ('Dimensions' in label and not is_description
                        and 'Support' not in label):
                    dimensions = row.find(
                        'p', class_='pull-right').strong.text.strip()

                # Medium (Sculpture/Painting) and Technique
                elif 'Medium' in label and not is_description:
                    pieces = row.find(
                        'p', class_='pull-right').text.split("   ")
                    technique = "".join(
                        piece.strip() + " " for piece in pieces
                        if piece != "")

                # Type
                elif 'Type' in label and not is_description:
                    type_ = row.find(
                        'p', class_='pull-right text-right'
                    ).text.strip().split('  ')[0]

                # Support (base)
                elif 'Support' in label and not is_description:
                    cell = row.find('p', class_='pull-right text-right')
                    try:
                        parts = cell.text.strip().split('  ')
                        support = parts[0] + '. ' + parts[1].strip('\n')
                        support += cell.strong.text.strip().strip('\n')
                    except IndexError:
                        support = cell.text.strip()

                # Framing
                elif 'Framing' in label and not is_description:
                    frame = row.find('p', class_='pull-right').text.strip()

                # Signature
                elif 'Signature' in label and not is_description:
                    signature = row.find(
                        'p', class_='pull-right').text.strip()

                # Authenticity
                elif 'Authenticity' in label and not is_description:
                    authenticity = row.find(
                        'p', class_='pull-right text-right').text.strip()

                # Artwork Description
                elif is_description:
                    intro = row.find('p', class_="marg-bot-10")
                    if intro is not None:
                        body = row.find(
                            'div',
                            class_="description-catalog see-more text-justify"
                        ).text.strip()
                        # Assigned only once both parts are read, so a
                        # failure never leaves a Tag object in `about`.
                        about = intro.text.strip() + body
                    else:
                        about = row.find('p', class_='').text.strip()
        except AttributeError:
            pass

        artwork_bundle = {
            "artwork_title": artwork,
            "artist_name": artist,
            "year": year,
            "price": price,
            "Medium": medium,
            "Type": type_,
            "Dimensions": dimensions,
            "Support": support,
            "Frame": frame,
            "Signature": signature,
            "Authenticity": authenticity,
            "About": about,
            "platform": self.website.platform,
            "image_addr": image_loc,
            "seller_id": seller_id,
            "artist_id": artist_id,
            "url": url,
            "technique": technique
        }
        TheAuthour.write_artwork_price_image(**artwork_bundle)

Example #12

0

Show file

File: artsy.py Project: SVwrite/Web_Scrapers

    def get_artwork_data_slave(self, url):
        # print("ARTWORK DATA SLAVE STARTS")
        soup = TheMiner.fetch_page(url)
        # print("A")
        if soup is not None:
            # print("ARTWORK DATA SLAVE GETS SOUP")
            # Field initiation ::

            artwork = None
            price = None
            type_ = None
            dimensions = None
            frame = None
            authenticity = None
            about = None
            artist_id = None
            image_loc = None
            year = None
            support = None
            signature = None
            # Material to be added to technique
            technique = None

            seller_id = None
            artist = None
            medium = None

            # Medium must always have "Painting" or "Sculpture" (RULE :: 2)
            # print("A.1")

            # Seller_url
            seller_url = None
            seller_box = soup.find_all(
                'div',
                re.compile(
                    r'Box-sc-15se88d-0 Flex-cw39ct-0 BorderBoxBase-sc-1072ama-0\.*'
                ))
            for se in seller_box:
                if se.get('data-test') == 'aboutTheWorkPartner':
                    try:
                        seller_url = se.find('a')['href']
                        if self.website.domain not in seller_url:
                            seller_url = self.link_maker(seller_url)
                    except TypeError:
                        seller_url = se.next.next.next.next.text
            # print(seller_url)

            # seller_id
            if seller_url is not None:
                seller_id = self.get_seller_id(seller_url)

            # artist url
            artist_url = None
            artist_box = soup.find_all(
                'div',
                re.compile(
                    r'Box-sc-15se88d-0 Flex-cw39ct-0 BorderBoxBase-sc-1072ama-0\.*'
                ))
            for ar in artist_box:
                if ar.get('data-test') == 'artistInfo':
                    try:
                        artist_url = ar.find('a')['href']
                        if self.website.domain not in artist_url:
                            artist_url = self.link_maker(artist_url)
                    except TypeError:
                        pass
            # print(artist_url)

            artist_id = self.get_artist_id(artist_url)
            # print(f"Seller id {seller_id} \nArtist id {artist_id}")
            # except AttributeError:
            #     pass

            # Medium
            try:
                medium = soup.find(
                    'dl', class_='Box-sc-15se88d-0 Flex-cw39ct-0 bKPevV'
                ).dd.text.strip()
                if "SCULPTURE" in str(medium).upper():
                    medium = "Sculpture"
                elif "PAINTING" in str(medium).upper():
                    medium = "Painting"
                else:
                    medium = None
            except AttributeError:
                pass
            # print(f"Medium {medium}")

            # Continue fetching data only if seller_id, artist_id and medium are found. (RULE :: 3, 4)
            if seller_id is not None and artist_id is not None and medium is not None:

                try:
                    price = soup.find_all(
                        'div',
                        class_=re.compile(
                            r'Box-sc-15se88d-0 Text-sc-18gcpao-0\.*'))
                    for p in price:
                        if p.get('data-test') == 'SaleMessage':
                            price = p.text
                            break

                    temp = ""
                    for i in price:
                        if i == "-":
                            break
                        if i.isdigit():
                            temp += i
                        if i == ".":
                            temp += i

                    price = float(temp) * rate
                    # Price
                    # print(price)
                except AttributeError:
                    price = None
                except ValueError:
                    price = None
                except TypeError:
                    price = None

                # RULE : 5
                if price is not None:

                    # Find artist, artwork, year, type_, dimensions, support, frame, signature, authenticity,
                    # about, image_loc(actual url of the image), and technique

                    # Wish the code to break if either Artist's name or Artwork's name are not found.
                    # Artist
                    artist_name = soup.find_all(
                        'div', class_=re.compile(r'Box-sc-15se88d-0'))
                    for a in artist_name:
                        if a.get('data-test') == 'artworkSidebar':
                            artist_ = a.find_all(
                                'div',
                                class_=re.compile(
                                    r'Box-sc-15se88d-0 Text-sc-18gcpao-0\.*'))
                            for a in artist_:
                                if len(a.text.strip()) != 0:
                                    artist = a.text
                                    # print(artist)
                                    break
                            break
                    # print(artist)

                    # Artwork
                    artwork_block = soup.find('h1').text.split(",")
                    artwork = artwork_block[0].strip()
                    try:
                        year = artwork_block[-1].strip()
                        t = ""
                        for y in year:
                            if str(y) == "-":
                                break
                            if str(y).isnumeric():
                                t += y
                        year = int(t)
                    except ValueError:
                        year = None

                    # type(unique or what)
                    try:
                        type_ = soup.find(
                            'h1'
                        ).nextSibling.nextSibling.nextSibling.text.strip()
                    except AttributeError:
                        pass

                    # Dimensions
                    try:
                        dimensions = soup.find(
                            'h1').nextSibling.nextSibling.find_all('div')
                        for dim in dimensions:
                            if 'cm' in dim.text:
                                dimensions = dim.text.strip()
                    except AttributeError:
                        pass

                    # Technique
                    try:
                        technique = soup.find('h1').nextSibling.text.strip()
                        # print(technique)

                    except AttributeError:
                        pass

                    # Support, frame, sign, auth, about
                    # frame, auth , sign

                    try:
                        bundle = soup.find_all(
                            'div',
                            class_=re.compile(
                                r'Box-sc-15se88d-0 Flex-cw39ct-0 BorderBoxBase-sc-1072ama-0 BorderBox-sc-18mwadn-0 StackableBorderBox-sc-1odyc7i-0\.*'
                            ))
                        for b in bundle:
                            if b.get('data-test') == 'aboutTheWorkPartner':
                                bud = b.nextSibling
                                # print(bud.prettify())
                                break
                        bundle = bud.find_all('dl')
                        for dl in bundle:

                            if dl.next.text.strip() == 'Signature':
                                signature = dl.dd.text.strip()
                                continue

                            if dl.dt.text.strip(
                            ) == 'Certificate of authenticity':
                                authenticity = dl.dd.text.strip()
                                continue

                            if dl.dt.text.strip() == 'Frame':
                                frame = dl.dd.text.strip()
                                continue
                    except AttributeError:
                        pass

                    try:
                        about = soup.find(
                            'div',
                            class_='Box-sc-15se88d-0 Text-sc-18gcpao-0  gPzDV'
                        ).find('div',
                               class_='ReadMore__Container-sc-1bqy0ya-0 guOJdN'
                               ).p.text.strip().split("  ")

                        t = ""
                        for a in about:
                            t += a.strip()
                            t += " "
                        about = t
                    except AttributeError:
                        about = None

                    # Image location
                    try:
                        image_loc = soup.find_all('div',
                                                  class_='Box-sc-15se88d-0')
                        for loc in image_loc:
                            if loc.get('data-test') == 'artworkImage':
                                image_loc = loc.find('img').get('src')
                                break
                    except AttributeError:
                        pass

                    artwork_bundle = {
                        "artwork_title": artwork,
                        "artist_name": artist,
                        "year": year,
                        "price": price,
                        "Medium": medium,
                        "Type": type_,
                        "Dimensions": dimensions,
                        "Support": support,
                        "Frame": frame,
                        "Signature": signature,
                        "Authenticity": authenticity,
                        "About": about,
                        "platform": self.website.platform,
                        "image_addr": image_loc,
                        "seller_id": seller_id,
                        "artist_id": artist_id,
                        "url": url,
                        "technique": technique
                    }
                    # print(artwork_bundle)

                    TheAuthour.write_artwork_price_image(**artwork_bundle)
                else:
                    pass
                    # print(f"Skipping {url}\n PRICE : {price}")
            else:
                pass
                # print(f"Skipping : {url}\nSeller_id = {seller_id}, Artist_id = {artist_id}, medium = {medium}")
        else:
            pass

Example #13

0

Show file

File: artsy.py Project: SVwrite/Web_Scrapers

    def get_seller_data(self, url):
        """Fetch a seller's page and write the seller's details to the database.

        Callers: get_artwork_data_slave and get_seller_id. This is reached only
        after the seller was not found in SELLER_INFO.

        The seller is written only when a name can be read from the page
        header; location and website are best-effort and fall back to None.

        Args:
            url: Address of the seller's page.
        """
        # Remove the url from `visited` before fetching it.
        visited.discard(url)
        soup = TheMiner.fetch_page(url)
        if soup is None:
            return

        # Seller's name. Nothing is written when it cannot be found.
        try:
            seller_box = soup.find('div', id='jumpto--PartnerHeader')
            seller_name = seller_box.h1.text.strip()
        except AttributeError:
            return

        # Location: read from whatever follows the header's <h1>.
        try:
            # If the sibling is a Tag, calling it is bs4 shorthand for
            # find_all(); a text node or a missing sibling is not callable
            # and raises TypeError, which is handled below.
            siblings = seller_box.h1.nextSibling()
            try:
                location = siblings.text
            except AttributeError:
                # A list of nodes has no .text of its own: join the pieces.
                location = "".join(f"{node.text} " for node in siblings)
        except (AttributeError, TypeError):
            location = None

        # Website: the first anchor whose href contains "http".
        # Defaults to None so that a page without such a link does not leave
        # the whole list of anchors in `website`.
        website = None
        try:
            for link in soup.find_all('a'):
                href = link.get('href')
                if "http" in str(href):
                    website = href
                    break
        except AttributeError:
            website = None

        bundle = [
            url, self.website.platform, seller_name, location, website
        ]
        TheAuthour.write_seller(*bundle)

Example #14

0

Show file

File: artsy.py Project: SVwrite/Web_Scrapers

    def get_artist_data(self, soup, url):
        """Extract an artist's profile from an artist page and store it.

        Called by self.get_artwork_listings_slave(). Picks the name, birth
        year, country and bio out of ``soup``, records the artist's key in
        KEY_INFO under ``url`` and writes the artist through TheAuthour.

        Args:
            soup: Parsed artist page.
            url: Address of the artist page; used as the KEY_INFO key.

        Raises:
            AttributeError: If the page has no <h1>. This is deliberate: it
                means the page layout has changed and the code needs fixing.
        """
        header_cells = soup.find_all(
            'div',
            class_=re.compile(
                r'Box-sc-15se88d-0 GridColumns__Cell-sc-1g9p6xx-1\.*'))

        # Name. str.strip() always returns a string, so no None check is
        # needed afterwards; a missing <h1> raises here instead.
        name = soup.find('h1').text.strip()

        # The second header cell holds "<nationality>, <born>–<died>".
        # Both a missing cell (IndexError) and a missing <h2>
        # (AttributeError) mean the data is simply not available.
        born = None
        country = None
        try:
            headline = header_cells[1].find('h2').text.strip().split(",")
        except (AttributeError, IndexError):
            headline = None

        if headline is not None:
            # Substring matches, tried in this order (first hit wins).
            nationality_markers = (
                ("French", "France"),
                ("Argentine", "Argentina"),
                ("Dutch", "Netherlands"),
                ("Indian", "India"),
                ("Pakistani", "Pakistan"),
                ("Italian", "Italy"),
                ("English", "UK"),
                ("Chinese", "China"),
                ("Hispanic", "Spain"),
                ("German", "Germany"),
                ("Spanish", "Spain"),
                ("Russian", "Russia"),
                ("British", "UK"),
                ("Mexican", "Mexico"),
                ("Brazilian", "Brazil"),
                ("Canadian", "Canada"),
                ("Belgian", "Belgium"),
                ("Israeli", "Israel"),
                ("Venezuelan", "Venezuela"),
                ("Polish", "Poland"),
            )

            country = headline[0].strip()
            # "American" and "Japanese" must match the whole field exactly.
            if country == "American":
                country = "USA"
            elif country == "Japanese":
                country = "Japan"
            else:
                for marker, mapped in nationality_markers:
                    if marker in country:
                        country = mapped
                        break
                else:
                    # Unknown nationality: keep the text unless it contains
                    # a digit, in which case it is not a nationality at all.
                    if any(ch.isnumeric() for ch in country):
                        country = None

            # Birth year: the digits before the en dash of the last field.
            try:
                birth_part = str(headline[-1]).strip().split("–")[0]
                born = int("".join(ch for ch in birth_part if ch.isnumeric()))
            except ValueError:
                born = None

        # Bio: the text of the element following the "Bio" heading.
        about = None
        try:
            about_blocks = soup.find_all(
                'div',
                class_=re.compile(
                    r'Box-sc-15se88d-0 Text-sc-18gcpao-0\.*'))
            for block in about_blocks:
                if block.text.strip() == 'Bio':
                    about = block.nextSibling.text.strip()
                    break
        except AttributeError:
            about = None

        # pack = [name, born, country, about]
        artist_data_pack = [name, born, country, about]
        # Updating KEY_INFO dictionary.
        KEY_INFO[url] = db.Artist.key_maker(artist_data_pack)
        # Updating the dB with artist listings.
        TheAuthour.write_artist(*artist_data_pack)

Example #15

0

Show file

File: kazoart.py Project: SVwrite/Web_Scrapers

    def get_artwork_data_slave(self, url):
        """Scrape one KAZoART artwork page and write the artwork to the database.

        The medium is derived from the url ("/painting/" or "/sculpture/").
        The artwork is written only when a seller id, an artist id and a
        medium are all available (RULE :: 3, 4); otherwise the url is skipped
        with a printed message.

        Args:
            url: Address of the artwork page.

        Raises:
            AttributeError: If a mandatory element (seller link, artist,
                title, price, detail labels, image) is missing. This is
                deliberate so that layout changes are noticed and fixed.
            ValueError: If the price text contains no parsable number.
        """
        soup = TheMiner.fetch_page(url)
        if soup is None:
            return

        # Fields this page does not provide stay None.
        type_ = None
        dimensions = None
        frame = None
        authenticity = None
        year = None
        support = None
        signature = None

        # Medium must always have "Painting" or "Sculpture" (RULE :: 2)
        page_url = str(url)
        if "/painting/" in page_url:
            medium = "Painting"
        elif "/sculpture/" in page_url:
            medium = "Sculpture"
        else:
            # So that url leaks don't break the code.
            medium = None

        # We want the code to break if this entry is not found so that we can
        # fix it. THE PAGE MUST HAVE A SELLER.
        seller_url = str(
            soup.find('div',
                      class_='product-artist').a.get('href')).strip()
        seller_id = self.get_seller_id(seller_url)

        # On this platform the artist's page is the seller's page.
        artist_url = seller_url
        artist_id = self.get_artist_id(artist_url)

        # Continue fetching data only if seller_id, artist_id and medium are
        # found. (RULE :: 3, 4)
        if seller_id is None or artist_id is None or medium is None:
            print(
                f"Skipping : {url}\nSeller_id = {seller_id}, Artist_id = {artist_id}, medium = {medium}"
            )
            return

        heading = soup.h1
        artist = str(
            heading.find('div', class_='product-artist').a.text).strip()
        artwork = str(heading.find('div',
                                   class_='product-name').text).strip()

        # Price: keep only the digits and dots of the displayed text.
        price_text = str(
            soup.find('div', class_='product-price').find(
                'div', class_='p-price-container').text).strip()
        price = float("".join(
            ch for ch in price_text if ch.isdigit() or ch == "."))

        product_details_desc = soup.find('div',
                                         class_='product-details_desc')

        # For KAZoART both the "Technique" and the "Material" rows feed our
        # technique field. They are collected and joined, so neither row
        # overwrites the other and the result carries no stray whitespace.
        technique_parts = []
        for detail in product_details_desc.find_all('div',
                                                    class_='tech-item'):
            label = str(detail.find(
                'div', class_='tech-label').text).strip().upper()
            value = str(detail.find('div',
                                    class_='tech-value').text).strip()
            if label in ('TECHNIQUE', 'MATERIAL'):
                technique_parts.append(value)
            elif label == 'TYPE':
                type_ = value
            elif label == 'DIMENSIONS':
                dimensions = value
            elif label == 'FRAMING':
                frame = value
            elif label == 'QUALITY GUARANTEE':
                authenticity = value
        technique = " ".join(technique_parts).strip()

        # The description is optional.
        try:
            about = str(
                product_details_desc.find(
                    'div', class_='desc text-1').text).strip()
        except AttributeError:
            about = None

        image_loc = soup.find('div', class_='product-left').find(
            'div', class_='img-wrapper').img.get('src')

        artwork_bundle = {
            "artwork_title": artwork,
            "artist_name": artist,
            "year": year,
            "price": price,
            "Medium": medium,
            "Type": type_,
            "Dimensions": dimensions,
            "Support": support,
            "Frame": frame,
            "Signature": signature,
            "Authenticity": authenticity,
            "About": about,
            "platform": self.website.platform,
            "image_addr": image_loc,
            "seller_id": seller_id,
            "artist_id": artist_id,
            "url": url,
            "technique": technique
        }

        TheAuthour.write_artwork_price_image(**artwork_bundle)

Example #16

0

Show file

File: singulart.py Project: SVwrite/Web_Scrapers

    def get_artwork_data_slave(self, url):
        # Skeleton for scraping one artwork page and writing it through
        # TheAuthour.write_artwork_price_image().
        # NOTE(review): this is an unfinished template and is not valid Python
        # as it stands: the right-hand sides of `seller_url =`, `artist_url =`
        # and `about =` are missing, and `price` is never read from the page.
        # The page-specific selectors still have to be filled in.
        soup = TheMiner.fetch_page(url, ghost=True)
        if soup is not None:

            # Field initiation ::

            artwork = None
            price = None
            type_ = None
            dimensions = None
            frame = None
            authenticity = None
            about = None
            artist_id = None
            image_loc = None
            year = None
            support = None
            signature = None
            # Material to be added to technique
            technique = ""


            seller_id = None
            artist = None
            # Medium must always have "Painting" or "Sculpture" (RULE :: 2)
            if "/painting/" in str(url):
                medium = "Painting"  # (painting or sculpture)
            elif "/sculpture/" in str(url):
                medium = "Sculpture"
            else:
                # So that url leaks don't break the code.
                medium = None

            # Seller_id
            # TODO: selector for the seller's url is missing here.
            seller_url =
            # We'll let it crash at seller_url not found because that is the way of the world.
            seller_id = self.get_seller_id(seller_url)

            # Artist_id
            # TODO: selector for the artist's url is missing here.
            artist_url =
            artist_id = self.get_artist_id(artist_url)

            # Continue fetching data only if seller_id, artist_id and medium are found. (RULE :: 3, 4)
            if seller_id is not None and artist_id is not None and medium is not None:



                try :
                    # TODO: `price` is still None at this point; the price text
                    # has to be read from the page before it can be parsed.
                    price
                    # Keep only the digits and dots of the price text.
                    temp = ""
                    for i in price:
                        if i.isdigit():
                            temp += i
                        if i == ".":
                            temp += i
                    price = float(temp)
                    # Price
                    # print(price)
                except AttributeError:
                    price = None
                except ValueError:
                    # float() could not parse the collected characters.
                    price = None

                # RULE : 5
                if price is not None:

                    # Find artist, artwork, year, type_, dimensions, support, frame, signature, authenticity,
                    # about, image_loc(actual url of the image), and technique

                    # Wish the code to break if either Artist's name or Artwork's name are not found.
                    # Artist
                    print(artist)

                    # Artwork
                    print(artwork)

                    try:
                        # TODO: selector for the description is missing here.
                        about =
                    except AttributeError:
                        about = None

                    # Keyword arguments for TheAuthour.write_artwork_price_image().
                    artwork_bundle = {"artwork_title": artwork, "artist_name": artist, "year": year, "price": price,
                                      "Medium": medium, "Type": type_, "Dimensions": dimensions, "Support": support,
                                      "Frame": frame, "Signature": signature, "Authenticity": authenticity,
                                      "About": about, "platform": self.website.platform, "image_addr": image_loc,
                                      "seller_id": seller_id, "artist_id": artist_id, "url": url, "technique": technique}

                    TheAuthour.write_artwork_price_image(**artwork_bundle)
                else :
                    print(f"Skipping {url}\n PRICE : {price}")
            else:
                print(f"Skipping : {url}\nSeller_id = {seller_id}, Artist_id = {artist_id}, medium = {medium}")

Example #17

0

Show file

    def get_artwork_data_slave(self, url, driver):
        """Load one artwork page in ``driver``, scrape it and write the artwork.

        The artwork is written only when a seller id, an artist id (RULE :: 3,
        4) and a price (RULE : 5) are available; otherwise the url is skipped
        with a printed message.

        Args:
            url: Address of the artwork page.
            driver: Browser driver exposing ``get(url)`` and ``page_source``.

        Raises:
            AttributeError: If the artist block or the image wrapper is
                missing once the price has been found. The artist's name is
                deliberately mandatory.
        """
        driver.get(url)
        # The second argument of BeautifulSoup is the parser to use, not the
        # page url.
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        if soup is None:
            print(f"Soup not returned for {url}")
            return

        # Field initiation ::
        artwork = None
        price = None
        type_ = None
        dimensions = None
        frame = None
        authenticity = None
        about = None
        artist_id = None
        year = None
        support = None
        signature = None
        # Material to be added to technique
        technique = ""
        seller_id = None
        medium = None

        # Seller_id
        try:
            seller_url = soup.find('div', class_='WncCi').find('a')['href']
            seller_id = self.get_seller_id(seller_url)
        except (AttributeError, TypeError):
            # Seller doesn't have a page: a missing <a> makes the subscript
            # raise TypeError, a missing block raises AttributeError.
            # We'll let the seller name be seller_url if the url is not found.
            try:
                seller_url = soup.find('div', class_='WncCi').text.strip()
                if seller_url not in SELLER_INFO:
                    # Make a Kazoart style bundle, and write it to obtain a
                    # seller_id.
                    # [seller_url, platform_id(from name), Seller's name,
                    #  Location, website]
                    bundle = [
                        seller_url, self.website.platform,
                        'EMERGINGARTISTPLATFOM', None, None
                    ]
                    TheAuthour.write_seller(*bundle)
                # Either already known, or just generated by the write above.
                seller_id = SELLER_INFO.get(seller_url)
            except AttributeError:
                pass

        # Artist_id
        try:
            artist_url = soup.find('div', class_='WncCi').a.get('href')
            if str(artist_url).endswith(".com"):
                # Literal replacements: a regex '.com' would also match any
                # character followed by "com" (e.g. "lcom" in "welcome").
                artist_url = artist_url.replace('.com', "")
                artist_url = artist_url.replace(
                    'emergingartistplatform', 'emergingartistplatform.com')
            artist_id = self.get_artist_id(artist_url)
        except AttributeError:
            # No artist link: register the artist under the displayed name.
            try:
                artist_url = soup.find('div', class_='WncCi').text.strip()
                country = None
                for pre in soup.find_all('pre'):
                    if pre.get('data-hook') != 'description':
                        continue
                    for para in pre.find_all('p'):
                        text = para.text
                        if ('Country' in text or 'country' in text
                                or 'COUNTRY' in text):
                            country = text.split(":")[-1].strip()

                # pack = [name, born, country, about]
                artist_data_pack = [artist_url, None, country, None]
                # Updating KEY_INFO dictionary.
                KEY_INFO[artist_url] = db.Artist.key_maker(
                    artist_data_pack)
                key = KEY_INFO.get(artist_url)
                # Updating the dB with artist listings.
                TheAuthour.write_artist(*artist_data_pack)
                artist_id = ARTIST_INFO[key]
            except AttributeError:
                artist_id = None

        # Continue fetching data only if seller_id and artist_id are found.
        # (RULE :: 3, 4)
        if seller_id is None or artist_id is None:
            print(
                f"Skipping : {url}\nSeller_id = {seller_id}, Artist_id = {artist_id}, medium = {medium}"
            )
            return

        # Price: digits and dots of the primary price, converted with `rate`.
        try:
            digits = ""
            for span in soup.find_all('span'):
                if span.get('data-hook') == "formatted-primary-price":
                    digits += "".join(
                        ch for ch in span.text
                        if ch.isnumeric() or ch == ".")
            price = float(digits) * rate
        except (AttributeError, ValueError):
            price = None

        # RULE : 5
        if price is None:
            print(f"Skipping {url}\n PRICE : {price}")
            return

        # Wish the code to break if the Artist's name is not found.
        artist = soup.find('div', class_='WncCi').text.strip()

        # The description block lists "Label: value" paragraphs.
        for pre in soup.find_all('pre'):
            if pre.get('data-hook') != 'description':
                continue
            for para in pre.find_all('p'):
                text = para.text
                value = text.split(":")[-1].strip()

                if 'Title' in text or 'title' in text or 'TITLE' in text:
                    # Titles are capped at 255 characters.
                    artwork = value[:255]
                if 'Date' in text:
                    year = value
                if 'Size' in text:
                    dimensions = value
                if 'Medium' in text:
                    technique = value

                # The first non-empty paragraph without a label is taken as
                # the description. The whole paragraph is kept, not just its
                # last character.
                if about is None and ":" not in text:
                    stripped = text.strip()
                    if stripped:
                        about = stripped

        # Medium (RULE : 3)
        if "Sculptures" in self.website.start_url:
            medium = "Sculpture"
        else:
            medium = "Painting"

        # image_loc
        image = soup.find('div', class_='main-media-image-wrapper-hook')
        image = image.find('div', id='get-image-item-id')
        image_loc = image.get('href')

        artwork_bundle = {
            "artwork_title": artwork,
            "artist_name": artist,
            "year": year,
            "price": price,
            "Medium": medium,
            "Type": type_,
            "Dimensions": dimensions,
            "Support": support,
            "Frame": frame,
            "Signature": signature,
            "Authenticity": authenticity,
            "About": about,
            "platform": self.website.platform,
            "image_addr": image_loc,
            "seller_id": seller_id,
            "artist_id": artist_id,
            "url": url,
            "technique": technique
        }

        TheAuthour.write_artwork_price_image(**artwork_bundle)