コード例 #1
0
ファイル: singulart.py プロジェクト: SVwrite/Web_Scrapers
    def get_seller_data(self, url):
        """Fetch a seller page and write the seller record through ``TheAuthour``.

        Removes ``url`` from ``visited``, fetches the page with
        ``TheMiner.fetch_page`` and, when a page comes back, passes
        ``[url, platform, seller_name, location, website]`` to
        ``TheAuthour.write_seller``. Nothing is written when the fetch
        returns ``None``.

        NOTE(review): in the lines visible here ``seller_name``, ``location``
        and ``website`` are only read (printed), never assigned from ``soup``.
        The statements that extract them appear to be missing; unless these
        names exist in an enclosing scope, the first ``print`` raises
        ``NameError``, which none of the ``except`` clauses below catch.
        Confirm against the full file.
        """
        # Caller :: get_artwork_data_slave and get_seller_id
        # Drop the url from the visited set before fetching it.
        visited.discard(url)
        soup = TheMiner.fetch_page(url)

        if soup is not None:

            # Seller's Name
            print(seller_name)
            # Code will break if seller's name is not found

            # Location (falls back to None on AttributeError)
            try:
                print(location)
            except AttributeError:
                location = None

            # Website (falls back to None on AttributeError or TypeError)
            try:
                print(website)
            except AttributeError:
                website = None
            except TypeError:
                website = None

            # Positional order expected by write_seller as used here:
            # url, platform, seller name, location, website.
            bundle = [url, self.website.platform, seller_name, location, website]
            print(bundle)
            TheAuthour.write_seller(*bundle)
コード例 #2
0
ファイル: artsper_new.py プロジェクト: SVwrite/Web_Scrapers
    def get_seller_data(self, url):
        """Scrape a seller page and write the seller record via ``TheAuthour``.

        Called from ``get_artwork_data_slave``. The url is removed from
        ``visited`` and fetched; when no page comes back nothing is written.
        A missing seller name is deliberately not handled and raises.

        :param url: address of the seller page.
        """
        visited.discard(url)
        soup = TheMiner.fetch_page(url)
        if soup is None:
            return

        seller_box = soup.find('div', id='top-seller')
        # Intentionally unguarded: the name is mandatory.
        seller_name = seller_box.h1.text.strip()

        # Location is the last comma-separated piece of the subtitle line.
        try:
            subtitle_parts = seller_box.find(
                'p', class_="subtitle").text.strip().split(',')
            location = subtitle_parts[-1].strip()
        except AttributeError:
            location = None

        # Website is the first link inside the "websites" list, if present.
        try:
            links = soup.find('ul', id="websites")
            website = str(links.a['href']).strip()
        except (AttributeError, TypeError):
            website = None

        bundle = (url, self.website.platform, seller_name, location, website)
        TheAuthour.write_seller(*bundle)
コード例 #3
0
ファイル: singulart.py プロジェクト: SVwrite/Web_Scrapers
    def get_seller_id(self, seller_url) -> int:
        """Return the seller id for ``seller_url``, creating the seller if needed.

        Lookup order: the ``SELLER_INFO`` cache, then ``get_seller_data`` on
        the url, then a placeholder 'BAREBONES' seller row. ``None`` is
        returned (after printing an error) when no id can be obtained, which
        lets the caller stop at its rule 3 check.

        :param seller_url: key identifying the seller, or ``None``.
        """
        if seller_url is None:
            print("FATAL ERROR :: Seller_id not found.")
            return None

        # Already known seller.
        if seller_url in SELLER_INFO:
            return SELLER_INFO.get(seller_url)

        # Unknown seller: scrape its page, which should register it.
        self.get_seller_data(seller_url)

        if seller_url not in SELLER_INFO:
            # Scraping did not register it; write a minimal record instead.
            fallback = (seller_url, self.website.platform, 'BAREBONES', None,
                        seller_url)
            TheAuthour.write_seller(*fallback)

        if seller_url in SELLER_INFO:
            return SELLER_INFO.get(seller_url)

        print("FATAL ERROR :: Seller_id not found.")
        return None
コード例 #4
0
ファイル: kazoart.py プロジェクト: SVwrite/Web_Scrapers
    def get_seller_id(self, seller_url) -> int:
        """Return the seller id for ``seller_url``, writing the seller if unknown.

        An unknown seller is written with the fixed name 'KAZoART' and the
        url as its website, then looked up again in ``SELLER_INFO``. ``None``
        is returned (after printing an error) when the url is ``None`` or the
        id is still missing after the write.

        :param seller_url: key identifying the seller, or ``None``.
        """
        if seller_url is None:
            print("FATAL ERROR :: Seller_id not found.")
            return None

        if seller_url not in SELLER_INFO:
            # Not cached yet: register the seller, then re-check the cache.
            record = (seller_url, self.website.platform, 'KAZoART', None,
                      seller_url)
            TheAuthour.write_seller(*record)

            if seller_url not in SELLER_INFO:
                print("FATAL ERROR :: Seller_id not found.")
                return None

        return SELLER_INFO.get(seller_url)
コード例 #5
0
ファイル: artsy.py プロジェクト: SVwrite/Web_Scrapers
    def get_seller_id(self, seller_url) -> int:
        """Return the seller id for ``seller_url``, creating the seller if needed.

        Tries the ``SELLER_INFO`` cache first, then ``get_seller_data``; if
        that does not register the seller (for example because the value is
        not a real url), a record using ``seller_url`` as both name and
        website is written. ``None`` is returned, after printing an error,
        when no id can be found.

        :param seller_url: key identifying the seller, or ``None``.
        """
        if seller_url is None:
            print("FATAL ERROR :: Seller_id not found.")
            return None

        if seller_url in SELLER_INFO:
            return SELLER_INFO.get(seller_url)

        # First attempt: scrape the seller page.
        self.get_seller_data(seller_url)

        if seller_url not in SELLER_INFO:
            # Second attempt: [seller_url, platform, name, location, website]
            placeholder = (seller_url, self.website.platform, seller_url, None,
                           seller_url)
            TheAuthour.write_seller(*placeholder)

        # Expected to succeed unless the write itself failed.
        if seller_url not in SELLER_INFO:
            print("FATAL ERROR :: Seller_id not found.")
            return None
        return SELLER_INFO.get(seller_url)
コード例 #6
0
ファイル: artsper_new.py プロジェクト: SVwrite/Web_Scrapers
    def get_artwork_data_slave(self, url):
        """Scrape one artwork page and hand the collected fields to ``TheAuthour``.

        A record is written only when all of the following hold; otherwise a
        diagnostic line is printed and nothing is written:

        * a price could be parsed from the page (rule 5),
        * both a seller id and an artist id were resolved (rule 3),
        * the url has a ``painting`` or ``sculpture`` path segment (rule 2).

        :param url: address of the artwork page to fetch and parse.
        """
        soup = TheMiner.fetch_page(url)
        if soup is None:
            print(f"\n\n\n\n\nURL DIDN'T RETURN : {url}\n\n\n\n\n")
            return

        # Fields that stay at these defaults unless found on the page.
        seller_id = None
        artist_id = None
        price = None
        technique = ""  # Material and style
        type_ = None
        dimensions = None
        frame = None
        authenticity = None
        about = None
        image_loc = None
        support = None
        signature = None

        # PRICE: keep digits and dots up to the first '-' (start of a range).
        try:
            info = soup.find('section', id='informations')
            price_text = str(
                info.find('p', class_='media-price price').text).strip()
            number = ''
            for ch in price_text:
                if ch == '-':
                    break
                if ch.isdigit() or ch == ".":
                    number += ch
            price = float(number)
        except (AttributeError, TypeError, ValueError):
            # ValueError: the price text held no parsable number; treat it
            # like a missing price instead of crashing.
            price = None

        # Rule : 5
        if price is None:
            # If the price is not available, we skip the entire process.
            print(f"PRICE NOT FOUND : {price} at {url}")
            return

        # Seller_id
        try:
            seller_url = soup.find('div', id='top-seller').find('a').get('href')
            if 'galeries-d-art' in str(seller_url):
                seller_url = re.sub('galeries-d-art', 'art-galleries',
                                    seller_url)
            seller_id = self.get_seller_id(seller_url)
        except AttributeError:
            # Some sellers have no page of their own; the displayed seller
            # name is then used as the key in place of a url.
            seller_box = soup.find('div', id='top-seller')
            seller_url = str(
                seller_box.find('p', class_='highlight-title').text).strip()

            if seller_url in SELLER_INFO:
                seller_id = SELLER_INFO[seller_url]
            else:
                location = seller_box.find(
                    'p', class_='subtitle').text.strip().split(',')
                location = str(location[-1]).strip()

                bundle = [
                    seller_url, self.website.platform, seller_url, location,
                    None
                ]
                # Write the seller directly, then read back its id.
                TheAuthour.write_seller(*bundle)
                seller_id = SELLER_INFO[seller_url]

        # Artist_id
        try:
            artist_url = soup.find('section', id='informations').find(
                'div', class_='relative').a.get('href')
            if "oeuvres-d-art-contemporain" in artist_url:
                # re.sub returns a new string; the result must be kept.
                artist_url = re.sub("oeuvres-d-art-contemporain",
                                    "contemporary-artworks", artist_url)
            artist_id = self.get_artist_id(artist_url)
        except AttributeError:
            artist_id = None
            print("\n\n\n\n\n")
            print(url)
            print("\n\n\n\n\n")
            # NOTE(review): long pause kept from the original; it looks like
            # a debugging aid - confirm it is still wanted.
            time.sleep(50)

        # Medium must always have "Painting" or "Sculpture" (RULE :: 2)
        segments = str(url).split('/')
        if 'painting' in segments:
            # NOTE(review): trailing space kept as in the original value.
            medium = "Painting "
        elif 'sculpture' in segments:
            medium = "Sculpture"
        else:
            # So that url leaks don't break the code.
            medium = None

        # Rule 3 (ids present) and rule 2 (known medium).
        if seller_id is None or artist_id is None or medium is None:
            print(f"SELLER ID :: {seller_id},\nARTIST ID :: {artist_id}")
            return

        # ______________________________MAIN DATA FETCHING____________________
        info = soup.find('section', id='informations')
        header = info.find('div', class_='relative')

        # ARTIST'S NAME
        artist = header.find('span', class_='primary-title').text.strip()

        # ARTWORK'S NAME and YEAR: the secondary title is split on commas;
        # the last piece is taken as the year, everything before it is the
        # title. With no comma the title is empty and the whole text lands
        # in ``year``.
        secondary = header.find('span', class_='secondary-title').text.strip()
        pieces = secondary.split(',')
        if len(pieces) > 1:
            artwork = ", ".join(
                pieces[:1] + [piece.strip() for piece in pieces[1:-1]])
        else:
            artwork = ""
        year = pieces[-1].strip()

        # Image url
        try:
            image_box = info.find('div', id='img-container')
            image_loc = image_box.find('img', id='img_original')['data-src']
        except AttributeError:
            pass

        # Description tab: dimensions, technique, type, support, framing,
        # signature, authenticity and the free-text description. Any
        # AttributeError aborts the remaining items and keeps what was read.
        try:
            items = soup.find('div', id='tabs-description').ul.find_all('li')

            for item in items:
                label = item.text
                in_about = 'About the artwork' in label

                # Dimensions
                if 'Dimensions' in label and not in_about and 'Support' not in label:
                    dimensions = item.find(
                        'p', class_='pull-right').strong.text.strip()
                    continue

                # Medium row holds the technique (material and style).
                if 'Medium' in label and not in_about:
                    chunks = item.find(
                        'p', class_='pull-right').text.split("   ")
                    technique = "".join(
                        chunk.strip() + " " for chunk in chunks if chunk != "")
                    continue

                # Type
                if 'Type' in label and not in_about:
                    type_ = item.find(
                        'p', class_='pull-right text-right'
                    ).text.strip().split('  ')[0]
                    continue

                # Support (base)
                if 'Support' in label and not in_about:
                    try:
                        parts = item.find(
                            'p', class_='pull-right text-right'
                        ).text.strip().split('  ')
                        support = parts[0] + '. ' + parts[1].strip('\n')
                        support += item.find(
                            'p', class_='pull-right text-right'
                        ).strong.text.strip().strip('\n')
                    except IndexError:
                        support = item.find(
                            'p', class_='pull-right text-right').text.strip()
                    continue

                # Framing
                if 'Framing' in label and not in_about:
                    frame = item.find('p', class_='pull-right').text.strip()
                    continue

                # Signature
                if 'Signature' in label and not in_about:
                    signature = item.find(
                        'p', class_='pull-right').text.strip()
                    continue

                # Authenticity
                if 'Authenticity' in label and not in_about:
                    authenticity = item.find(
                        'p', class_='pull-right text-right').text.strip()
                    continue

                # Artwork Description
                if in_about:
                    about = item.find('p', class_="marg-bot-10")
                    if about is not None:
                        extra = item.find(
                            'div',
                            class_="description-catalog see-more text-justify"
                        ).text.strip()
                        about = about.text.strip()
                        about += extra
                    else:
                        about = item.find('p', class_='').text.strip()
                    continue
        except AttributeError:
            pass

        artwork_bundle = {
            "artwork_title": artwork,
            "artist_name": artist,
            "year": year,
            "price": price,
            "Medium": medium,
            "Type": type_,
            "Dimensions": dimensions,
            "Support": support,
            "Frame": frame,
            "Signature": signature,
            "Authenticity": authenticity,
            "About": about,
            "platform": self.website.platform,
            "image_addr": image_loc,
            "seller_id": seller_id,
            "artist_id": artist_id,
            "url": url,
            "technique": technique
        }
        TheAuthour.write_artwork_price_image(**artwork_bundle)
コード例 #7
0
ファイル: artsy.py プロジェクト: SVwrite/Web_Scrapers
    def get_seller_data(self, url):
        """Scrape a seller page and write the seller record via ``TheAuthour``.

        Called from ``get_artwork_data_slave`` and ``get_seller_id`` when the
        seller is not yet in ``SELLER_INFO``. Nothing is written when the page
        cannot be fetched or the seller name cannot be read from it.

        :param url: address of the seller page.
        """
        visited.discard(url)
        soup = TheMiner.fetch_page(url)
        if soup is None:
            return

        # Seller's name is mandatory; without it no record is written.
        try:
            seller_box = soup.find('div', id='jumpto--PartnerHeader')
            seller_name = seller_box.h1.text.strip()
        except AttributeError:
            return

        # Location: taken from whatever follows the name heading.
        try:
            location = ""
            # NOTE(review): ``nextSibling`` is accessed and then called. In
            # BeautifulSoup, calling a Tag is shorthand for find_all(), so
            # this presumably yields the sibling's descendants - confirm.
            locatio = seller_box.h1.nextSibling()
            try:
                location = locatio.text
            except AttributeError:
                for l in locatio:
                    location += l.text
                    location += " "
        except (AttributeError, TypeError):
            location = None

        # Website: first anchor whose href contains "http". Stays None when
        # there is no such anchor (previously the whole list of anchors was
        # left in ``website`` and written to the record).
        website = None
        try:
            for anchor in soup.find_all('a'):
                href = anchor.get('href')
                if "http" in str(href):
                    website = href
                    print(href)
                    break
        except (AttributeError, IndexError):
            website = None

        bundle = [
            url, self.website.platform, seller_name, location, website
        ]
        TheAuthour.write_seller(*bundle)
コード例 #8
0
    def get_artwork_data_slave(self, url, driver):
        """Load an artwork page in ``driver``, scrape it and write the record.

        A record is written only when a seller id and an artist id could be
        resolved and a price was found; otherwise a "Skipping" line is
        printed. A missing artist name is deliberately left unhandled and
        raises.

        :param url: address of the artwork page.
        :param driver: browser driver; only ``driver.get(url)`` and
            ``driver.page_source`` are used here.
        """
        driver.get(url)
        # The second argument of BeautifulSoup is the parser name, not a url.
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        if soup is None:
            print(f"Soup not returned for {url}")
            return

        # Fields that stay at these defaults unless found on the page.
        artwork = None
        price = None
        type_ = None
        dimensions = None
        frame = None
        authenticity = None
        about = None
        artist_id = None
        image_loc = None
        year = None
        support = None
        signature = None
        # Material to be added to technique
        technique = ""
        seller_id = None
        medium = None

        # Seller_id
        try:
            seller_url = soup.find('div', class_='WncCi').find('a')['href']
            seller_id = self.get_seller_id(seller_url)
        except (AttributeError, TypeError):
            # Seller doesn't have a page (no div, or no link inside it:
            # subscripting the missing link raises TypeError). The displayed
            # text is used as the seller key instead.
            try:
                seller_url = soup.find('div', class_='WncCi').text.strip()
                if seller_url in SELLER_INFO:
                    seller_id = SELLER_INFO.get(seller_url)
                else:
                    # [seller_url, platform, seller's name, location, website]
                    bundle = [
                        seller_url, self.website.platform,
                        'EMERGINGARTISTPLATFOM', None, None
                    ]
                    TheAuthour.write_seller(*bundle)
                    seller_id = SELLER_INFO.get(seller_url)
            except AttributeError:
                pass

        # Artist_id
        try:
            artist_url = soup.find('div', class_='WncCi').a.get('href')
            if str(artist_url).endswith(".com"):
                # Strip literal ".com" occurrences (dot escaped so it no
                # longer matches any character), then restore the domain's.
                artist_url = re.sub(r'\.com', "", artist_url)
                artist_url = re.sub('emergingartistplatform',
                                    'emergingartistplatform.com',
                                    artist_url)
            artist_id = self.get_artist_id(artist_url)
        except AttributeError:
            # No artist link: register the artist from the displayed name
            # and the country found in the description, if any.
            try:
                artist_url = soup.find('div', class_='WncCi').text.strip()
                country = None
                for block in soup.find_all('pre'):
                    if block.get('data-hook') == 'description':
                        for para in block.find_all('p'):
                            text = para.text
                            if 'Country' in text or 'country' in text or 'COUNTRY' in text:
                                country = text.split(":")[-1].strip()

                # artist_data_pack = [name, born, country, about]
                artist_data_pack = [artist_url, None, country, None]
                # Updating KEY_INFO dictionary.
                KEY_INFO[artist_url] = db.Artist.key_maker(artist_data_pack)
                key = KEY_INFO.get(artist_url)
                # Updating the dB with artist listings.
                TheAuthour.write_artist(*artist_data_pack)
                artist_id = ARTIST_INFO[key]
            except AttributeError:
                artist_id = None

        # Continue only if seller_id and artist_id are found. (RULE :: 3, 4)
        if seller_id is None or artist_id is None:
            print(
                f"Skipping : {url}\nSeller_id = {seller_id}, Artist_id = {artist_id}, medium = {medium}"
            )
            return

        # Price: digits and dots of the primary price span, times ``rate``.
        # NOTE(review): ``rate`` is not defined in this block; presumably a
        # module-level conversion factor - confirm.
        try:
            digits = ""
            for span in soup.find_all('span'):
                if span.get('data-hook') == "formatted-primary-price":
                    for ch in span.text:
                        if str(ch).isnumeric() or str(ch) == ".":
                            digits += ch
            price = float(digits) * rate
        except (AttributeError, ValueError):
            price = None

        # RULE : 5
        if price is None:
            print(f"Skipping {url}\n PRICE : {price}")
            return

        # Artist: intentionally unguarded, the name is mandatory.
        artist = soup.find('div', class_='WncCi').text.strip()

        # Description paragraphs hold "Label: value" pairs; a paragraph
        # without a colon is taken as the free-text description.
        for block in soup.find_all('pre'):
            if block.get('data-hook') != 'description':
                continue
            for para in block.find_all('p'):
                text = para.text
                value = text.split(":")[-1].strip()

                if 'Title' in text or 'title' in text or 'TITLE' in text:
                    # Titles are capped at 255 characters.
                    artwork = value[0:255]

                if 'Date' in text:
                    year = value

                if 'Size' in text:
                    dimensions = value

                if 'Medium' in text:
                    technique = value

                if ":" not in text and about is None and text.strip():
                    # Keep the whole paragraph (previously only its last
                    # character was stored); empty paragraphs are skipped.
                    about = text.strip()

        # Medium (RULE : 3)
        if "Sculptures" in self.website.start_url:
            medium = "Sculpture"
        else:
            medium = "Painting"

        # image_loc: optional, left as None when the nodes are missing.
        try:
            image = soup.find('div', class_='main-media-image-wrapper-hook')
            image = image.find('div', id='get-image-item-id')
            image_loc = image.get('href')
        except AttributeError:
            image_loc = None

        artwork_bundle = {
            "artwork_title": artwork,
            "artist_name": artist,
            "year": year,
            "price": price,
            "Medium": medium,
            "Type": type_,
            "Dimensions": dimensions,
            "Support": support,
            "Frame": frame,
            "Signature": signature,
            "Authenticity": authenticity,
            "About": about,
            "platform": self.website.platform,
            "image_addr": image_loc,
            "seller_id": seller_id,
            "artist_id": artist_id,
            "url": url,
            "technique": technique
        }

        TheAuthour.write_artwork_price_image(**artwork_bundle)