def miner(self):
    """Miner's track for this module.

    We land on the artwork listings page and pick the listings from there;
    sellers and artists are picked from the artwork pages, and from artwork
    pages we fetch the artwork for sale for the artists listed.
    """
    self.get_artist_listings()
    # Re-release the first-product links from `visited` so those pages can be
    # fetched again by the listings pass below.
    for link in self.first_prod_list:
        visited.discard(link)
    # get_artwork_listings_master -> get_artwork_listings_slave
    #   -> get_artist_data -> write_artist_data
    # So after this call we're done with artist data.
    self.get_artwork_listings_master()
    self.get_artwork_data_master()
    # Data collection completed for this module; downloading images now.
    TheMiner.sir_image_manager()
def miner(self):
    """Pipeline driver: artist listings -> artwork listings -> artwork data -> images."""
    self.get_artist_listings()
    # Progress visibility: how many artist pages were discovered.
    print(len(self.artist_listings))
    self.get_artwork_listings_master()
    # get_artwork_listings_master -> get_artwork_listings_slave -> get_artist_data -> write_artist_data
    # So we're done with artist data.
    print(len(self.artwork_listings))
    self.get_artwork_data_master()
    # DATA COLLECTION COMPLETED FOR THIS MODULE. DOWNLOADING IMAGES NOW.
    TheMiner.sir_image_manager()
def get_seller_data(self, url):
    """Scrape one seller page and persist [url, platform, name, location, website].

    Caller :: get_artwork_data_slave and get_seller_id.
    The original body printed names that were never assigned (skeleton code);
    the selectors below are reconstructed from the working sibling implementation.
    """
    visited.discard(url)
    soup = TheMiner.fetch_page(url)
    if soup is not None:
        header = soup.find('div', id='top-seller')
        # Seller's name — deliberately unguarded: the code should break if the
        # seller's name is not found, signalling a layout change.
        seller_name = header.h1.text.strip()
        # Location (last comma-separated token of the subtitle).
        try:
            location = header.find('p', class_="subtitle").text.strip().split(',')
            location = location[-1].strip()
        except AttributeError:
            location = None
        # Website (absent for many sellers).
        try:
            website = str(soup.find('ul', id="websites").a['href']).strip()
        except (AttributeError, TypeError):
            website = None
        bundle = [url, self.website.platform, seller_name, location, website]
        TheAuthour.write_seller(*bundle)
def recurrent(i_url, depth):
    """Collect purchasable painting/sculpture links from a listings page,
    then recurse into every numbered pagination link.

    depth == 1 marks the first page of a listing; artist data is only
    harvested there.
    """
    soup = TheMiner.fetch_page(i_url)
    if soup is None:
        return
    product_list = soup.find('div', class_='product-list-wrapper')
    product_list = product_list.find_all('div', class_='grid-item')
    for product in product_list:
        item_price = str(
            product.find('div', class_='grid-item-price').text).strip().upper()
        # Discard the data that does not have a price.
        if item_price == "SOLD":
            continue
        product_link = str(product.a['href']).strip()
        # Discarding urls that do not take us to paintings and sculptures. (RULE : 1)
        if "/sculpture/" in product_link or "/painting/" in product_link:
            if product_link not in self.artwork_listings:
                self.artwork_listings.append(product_link)
    # Pick the artist's data only from the first page of the listings.
    if depth == 1:
        self.get_artist_data(soup, i_url)
    next_page = soup.find('div', class_='page-browser')
    if next_page is not None:
        numbers = next_page.find('div', class_='page-browser-numbers')
        for anchor in numbers.find_all('a', class_='page-browser-item '):
            # BUG FIX: the original recursed on `next_page` (the tag/ResultSet)
            # instead of the URL it had just built, so pagination never worked.
            recurrent(self.website.domain + str(anchor['href']), depth + 1)
def key_maker(artist_url):
    """Fetch the artist page and build an Artist key; None when the page fails."""
    visited.discard(artist_url)
    soup = TheMiner.fetch_page(artist_url)
    if soup is None:
        return None
    resume = soup.find('div', class_='artist-resume').find('div', class_='artist-resume_text')
    # If an error occurs here, the page layout has changed and the code needs fixing.
    name = resume.h1.text.strip()
    print(name)
    if name is None:
        return None
    try:
        country = resume.find('p', class_='location').text.strip().split('\n')
        country = country[0].split(',')[-1].strip()
        print(country)
    except AttributeError:
        country = None
    # About is either some text or the whole lookup raises (page has an #about div).
    about = soup.find('div', id='about').text.strip()
    # pack = [name, born, country, about]
    return db.Artist.key_maker([name, None, country, about])
def get_seller_data(self, url):
    """Scrape a seller page and write [url, platform, name, location, website].

    Caller :: get_artwork_data_slave
    """
    visited.discard(url)
    soup = TheMiner.fetch_page(url)
    if soup is None:
        return
    header = soup.find('div', id='top-seller')
    # Deliberately unguarded: break loudly if the seller's name is missing.
    seller_name = header.h1.text.strip()
    try:
        subtitle = header.find('p', class_="subtitle").text.strip().split(',')
        location = subtitle[-1].strip()
    except AttributeError:
        location = None
    try:
        website = str(soup.find('ul', id="websites").a['href']).strip()
    except (AttributeError, TypeError):
        website = None
    TheAuthour.write_seller(url, self.website.platform, seller_name, location, website)
def recurr(url):
    """Collect artist links from one listing page, queue pagination links into
    self.listy, and fan out over them with a thread pool.

    Singulart keeps blocking IPs, so the whole body is guarded and a blocked
    url is released from `visited` for a later retry.
    """
    soup = TheMiner.fetch_page(url, ghost=True)
    if soup is None:
        return
    try:
        artist_blocks = soup.find_all('figure', class_='pic-artist')
        print(len(artist_blocks))
        for artist in artist_blocks:
            link = artist.figcaption.h2.a.get('href')
            if self.website.domain not in link:
                # BUG FIX: the original called self.link_maker(list) — passing
                # the builtin `list` instead of the href just extracted.
                link = self.link_maker(link)
            self.artist_listings.append(link)
        # Queue the next pages.
        next_pages = soup.find('div', class_='pagerfanta').find('nav').find_all('a')
        for next_ in next_pages:
            link = next_.get('href')
            if self.website.domain not in link:
                link = self.link_maker(link)
            if link not in self.listy:
                self.listy.append(link)
        with concurrent.futures.ThreadPoolExecutor() as executor:
            for _ in executor.map(recurr, self.listy):
                pass
    except AttributeError:
        # Blocked / unexpected layout: allow this url to be fetched again.
        visited.discard(url)
def gal(url):
    """Pull the artist link out of an artwork page into self.artist_listings."""
    page = TheMiner.fetch_page(url)
    if page is None:
        return
    try:
        cell = page.find(
            'div',
            class_=re.compile(
                r'Box-sc-15se88d-0 GridColumns__Cell-sc-1g9p6xx-1 cviiXL\.*'))
        href = cell.find(
            'a', class_=re.compile(r'Box-sc-15se88d-0 Flex-cw39ct-0\.*'))['href']
        if self.website.domain not in href:
            href = self.link_maker(href)
        if href not in self.artist_listings:
            self.artist_listings.append(href)
    except (AttributeError, TypeError):
        # Layout miss — skip this page silently, as before.
        pass
def get_artist_listings(self):
    """Populate self.artist_listings from the start page's artist thumbnails.

    The original body had a blank right-hand side (`artist_thumbnails =`),
    a syntax error; the selector is reconstructed from the working variant
    of this method and each thumbnail's link is appended instead of the tag.
    """
    soup = TheMiner.fetch_page(self.website.start_url)
    if soup is not None:
        container = soup.find('div', class_='artists-thumbnails')
        artist_thumbnails = container.find_all('div', class_='artists-thumbnails__item')
        for artist in artist_thumbnails:
            self.artist_listings.append(str(artist.a['href']).strip())
def recurrent(i_url, depth):
    """INCOMPLETE TEMPLATE — several right-hand sides were never filled in;
    this function does not parse as-is. Selectors must be supplied before use.
    """
    soup = TheMiner.fetch_page(i_url)
    if soup is not None:
        # Gather a list of all the products.
        # TODO(review): `product_list` is never defined — selector missing.
        for product in product_list:
            # Check for price, discard the links that do not have a price.
            item_price =  # TODO(review): price selector never filled in
            # Discard the data that does not have a price. (RULE : 6)
            if not item_price == "SOLD":
                product_link =  # TODO(review): link selector never filled in
                # Discarding urls that do not take us to paintings and sculptures. (RULE : 1)
                if "/sculpture/" in product_link or "/painting/" in product_link:
                    if product_link not in self.artwork_listings:
                        self.artwork_listings.append(product_link)
        # Get artist data if depth is "1", if depth is more than "1" ignore this block.
        # To pick the artist's data, from the first page of the listings.
        if depth == 1:
            # Calling the function to fetch the artist data, and return artist_id
            self.get_artist_data(soup, i_url)
        # Find the links to all other pages.
        next_pages =  # TODO(review): pagination selector never filled in
        for next_page in next_pages:
            next_page =  # TODO(review): absolute-url construction never filled in
            depth += 1
            recurrent(next_page, depth)
def seller_info(self, soup):
    """Resolve the seller of an artwork page.

    Returns a pair (trigger, bundle):
      * (seller_id, None)      — seller already cached in SELLER_INFO;
      * (0, [name, location, website]) — fresh data scraped from the seller page;
      * (1, bundle)            — seller could not be resolved (bundle may be empty).
    """
    seller_bundle = []
    try:
        top = soup.find('div', id='top-seller')
        anchor = top.find('a')
        seller_name = str(anchor.text).strip()
        # NOTE(review): str(tag).strip() keeps the whole tag markup, not .text —
        # it is only used as part of the cache key, so it is consistent, but
        # confirm this is intentional.
        location = str(top.find('p', class_='subtitle')).strip()
        # If seller+location are already cached, return the known seller_id.
        seller_name = "_".join([seller_name, location])
        if seller_name in SELLER_INFO.keys():
            # BUG FIX: the original wrote SELLER_INFO(seller_name) — calling a
            # dict raises TypeError; use a lookup instead.
            seller_id = SELLER_INFO.get(seller_name)
            print(f"We have a seller for seller id {seller_id}, named {seller_name}")
            return seller_id, None
        link = anchor['href']
        if 'galeries-d-art' in str(link):
            link = re.sub('galeries-d-art', 'art-galleries', link)
    except AttributeError:
        link = None
    except TypeError:
        link = None
    if link is not None:
        # Moving to the seller page now.
        soup = TheMiner.fetch_page(link)
        if soup is not None:
            try:
                top = soup.find('div', id='top-seller')
                seller_name = top.h1.text.strip()
            except AttributeError:
                return 1, seller_bundle
            try:
                location = top.find('p', class_="subtitle").text.strip()
            except AttributeError:
                location = None
            try:
                website = str(soup.find('ul', id="websites").a['href']).strip()
            except (AttributeError, TypeError):
                website = None
            seller_bundle.append(seller_name)
            seller_bundle.append(location)
            seller_bundle.append(website)
            return 0, seller_bundle
    return 1, seller_bundle
def get_artist_listings(self):
    """Append every artist-thumbnail link on the start page to self.artist_listings."""
    soup = TheMiner.fetch_page(self.website.start_url)
    if soup is None:
        return
    container = soup.find('div', class_='artists-thumbnails')
    for thumb in container.find_all('div', class_='artists-thumbnails__item'):
        self.artist_listings.append(str(thumb.a['href']).strip())
def miner(self):
    """Driver: artwork listings -> artwork data -> image download.

    NOTE(review): get_artist_listings() is disabled in this variant, so artist
    data is still incomplete after get_artwork_listings_master — confirm this
    is intentional before relying on artist records from this run.
    """
    # get_artwork_listings_master -> get_artwork_listings_slave
    self.get_artwork_listings_master()
    print(len(self.artwork_listings))
    self.get_artwork_data_master()
    # Data collection completed for this module; downloading images now.
    print("downloading images now.")
    TheMiner.sir_image_manager(chunk_size=100)
def main():
    """Ad-hoc harness: scrape seller info from a single hard-coded artwork page."""
    # art_page_url = 'https://www.artsper.com/us/contemporary-artworks/painting/1147236/les-deux-freres'
    art_page_url = "https://www.artsper.com/in/contemporary-artworks/painting/189196/candy-zinzin-de-lespace"
    site = Website('https://www.artsper.com',
                   'https://www.artsper.com/us/contemporary-artists/youngtalents/painters?',
                   "ARTSPER")
    scraper = Artsper(site)
    # print(scraper.get_art_data_core(art_page_url))
    scraper.seller_info(TheMiner.fetch_page(art_page_url))
def main():
    """Full Artsper mining pass: painters, then sculptors, then image download."""
    start = time.perf_counter()
    # Prime the module-level caches (SELLER_INFO / ARTIST_INFO) from the database.
    db.Sellers().read_data_sellers()
    db.Artist().read_artist_data()
    painters = Website(
        'https://www.artsper.com',
        'https://www.artsper.com/us/contemporary-artists/youngtalents/painters?',
        "ARTSPER")
    Artsper(painters).artsper_mine()
    finish = time.perf_counter()
    print(
        f"Lap Completed in {round(finish - start, 2)}, seconds.\n Starting sculptures"
    )
    sculptors = Website(
        'https://www.artsper.com',
        'https://www.artsper.com/us/contemporary-artists/youngtalents/sculptors-artists',
        "ARTSPER")
    Artsper(sculptors).artsper_mine()
    finish = time.perf_counter()
    print(
        f"Lap Completed in {round(finish - start, 2)}, seconds.\n Downloading and updating images"
    )
    TheMiner.sir_image_manager()
    finish = time.perf_counter()
    print(f"Finished in {round(finish - start, 2)}, seconds")
def get_listing_pages(self, url):
    """Collect every paginated listing-page URL (starting from `url`) into
    self.listing_pages, normalising French artwork URLs to English ones."""
    print("Fetching All Listing pages")
    soup = TheMiner.fetch_page(url)
    # Pop out the url from visited so that it can be used again while fetching artists.
    # FIX: discard() instead of remove() — remove() raises KeyError when the
    # url was never recorded.
    visited.discard(url)
    self.listing_pages.append(url)
    # FIX: guard against a failed fetch — fetch_page returns None on failure.
    if soup is None:
        return
    listings = soup.find('div', class_="paginator")
    listings = listings.find_all('a')
    for lis in listings:
        u = self.website.url_maker(lis['href'])
        # Dealing with sites that throw the scraper on french webpages of the artworks.
        if "oeuvres-d-art-contemporain" in u:
            # FIX: re.sub returns the substituted string; the original threw
            # the result away, so the French URL was kept unchanged.
            u = re.sub("oeuvres-d-art-contemporain", "contemporary-artworks", u)
        if u not in self.listing_pages:
            self.listing_pages.append(u)
def get_artwork_listings_slave(self, url):
    """Collect all for-sale artwork links from one artist page into
    self.artwork_listings, then harvest the artist's data from the same soup.

    Runs on artist_listings.
    """
    soup = TheMiner.fetch_page(url)
    if soup is not None:
        # Gather a list of all the products by drilling through the styled-
        # component wrapper divs (class names are build hashes, hence regex).
        main_ = soup.find('main', id='main')
        main_ = main_.find('div', class_=re.compile(r'Box-sc-15se88d-0\.*'))
        main_ = main_.find(
            'div',
            class_=re.compile(
                r'Box-sc-15se88d-0 Shelf__Container-sc-1kdkue-0\.*'))
        # NOTE(review): both branches below use the *identical* regex — the
        # fallback was presumably meant to try a different class pattern;
        # confirm and fix the second selector.
        try:
            main1 = main_.find(
                'div',
                class_=re.compile(
                    r'Box-sc-15se88d-0 FullBleed-g9qwfe-0\.*'))
            product_list = main1.find_all('li')
        except AttributeError:
            try:
                main1 = main_.find(
                    'div',
                    class_=re.compile(
                        r'Box-sc-15se88d-0 FullBleed-g9qwfe-0\.*'))
                product_list = main1.find_all('li')
            except AttributeError:
                product_list = None
        if product_list is not None:
            for product in product_list:
                # All the products here are "Available for Sale."
                if self.website.domain not in product.a['href']:
                    product_link = self.link_maker(product.a['href'])
                else:
                    product_link = product.a['href']
                if product_link not in self.artwork_listings:
                    self.artwork_listings.append(product_link)
            # Sending the soup to fetch artist's data, make artist listings.
            self.get_artist_data(soup, url)
def recurr(url):
    """Collect artwork links from one grid page into self.first_prod_list,
    queue pagination links into the module-level `listy`, then fan out over
    `listy` with a thread pool."""
    soup = TheMiner.fetch_page(url)
    if soup is None:
        return
    try:
        container = soup.find(
            'div',
            class_=re.compile(r'LoadingArea__Container-sc-1cnoyb0-2\.*'))
        artist_thumbnails = container.find_all(
            'div',
            class_=re.compile(r'GridItem__ArtworkGridItem-l61twt-3\.*'))
        for thumb in artist_thumbnails:
            link = thumb.div.a['href']
            if self.website.domain not in link:
                link = self.link_maker(link)
            if link not in self.first_prod_list:
                self.first_prod_list.append(link)
    except AttributeError:
        # FIX: the original printed a literal "{}" — the url was never
        # interpolated because the f-prefix was missing.
        print(f"Something went wrong for url {url}")
    try:
        next_pages = soup.find(
            'nav',
            class_=re.compile(r'Box-sc-15se88d-0 Text-sc-18gcpao-0 ibHUpM\.*'))
        next_pages = next_pages.find_all(
            'a', class_=re.compile(r'Link-oxrwcw-0\.*'))
        for a in next_pages:
            link = self.link_maker(a['href'])
            if link not in listy:
                listy.append(link)
    except AttributeError:
        pass
    with concurrent.futures.ThreadPoolExecutor() as executor:
        for _ in executor.map(recurr, listy):
            pass
def recurrent(i_url, depth):
    """Collect unsold painting/sculpture links from a catalog page and recurse
    through the paginator; artist data is picked only on the first page."""
    soup = TheMiner.fetch_page(i_url)
    if soup is not None:
        artwork = soup.find('div', class_="catalog")
        artwork = artwork.find_all('figure')
        for art in artwork:
            # If listing is sold, don't pick it up.
            # (EAFP: the sold-out price tag only exists on sold listings.)
            try:
                sold = art.find('p', class_='price soldout sold').text
                sold = True
            except AttributeError:
                sold = False
            link = art.a['href']
            # Normalise French artwork URLs to their English equivalents.
            if 'oeuvres-d-art-contemporain' in link:
                link = re.sub('oeuvres-d-art-contemporain',
                              'contemporary-artworks', link)
            if link not in self.artwork_listings and not sold:
                # Keep only painting/sculpture path segments.
                la = str(link).split('/')
                if 'painting' in la or 'sculpture' in la:
                    self.artwork_listings.append(link)
        if depth == 1:
            # Calling the function to fetch the artist data, and return artist_id
            self.get_artist_data(soup, i_url)
        # This block picks the urls of pages for artists who have listings on
        # more than one page, and launches the code to pick the
        # artwork_listings and artist data.
        try:
            next_ = []
            listings = soup.find('div', class_="paginator")
            listings = listings.find_all('a')
            for li in listings:
                ur = self.website.url_maker(li['href'])
                next_.append(ur)
            for ur in next_:
                recurrent(ur, depth + 1)
        except AttributeError:
            # Artists who do not have a second listings page throw an
            # AttributeError here (no paginator div).
            pass
def recur(i_url, depth):
    """Append every figure link on the page to self.artist_listings; on the
    first page only, walk the paginator and recurse into each further page."""
    soup = TheMiner.fetch_page(i_url)
    if soup is None:
        return
    for figure in soup.find_all('figure'):
        self.artist_listings.append(str(figure.a['href']).strip())
    if depth == 1:
        next_ = []
        listings = soup.find('div', class_="paginator").find_all('a')
        for lis in listings:
            u = self.website.url_maker(lis['href'])
        # Dealing with sites that throw the scraper on french webpages of the artworks.
            if "oeuvres-d-art-contemporain" in u:
                # FIX: re.sub returns the substituted string; the original
                # discarded the result, leaving the French URL in place.
                u = re.sub("oeuvres-d-art-contemporain",
                           "contemporary-artworks", u)
            if u not in next_:
                next_.append(u)
        for link in next_:
            recur(link, depth + 1)
def artist_id_slave(self, artist_url):
    """Resolve artist_url -> artist_id via the KEY_INFO / ARTIST_INFO caches,
    scraping the artist page first so the caches are populated.

    Returns the artist_id, or None with a diagnostic print on any miss.
    """
    visited.discard(artist_url)
    soup = TheMiner.fetch_page(artist_url)
    if soup is None:
        print("ARTIST_ID_SLAVE : Soup not returned")
        return None
    self.get_artist_data(soup, artist_url)
    # url -> key
    if artist_url not in KEY_INFO.keys():
        print("ARTIST_ID_SLAVE : Could not find artist_id")
        return None
    key = KEY_INFO.get(artist_url)
    # key -> id
    if key not in ARTIST_INFO.keys():
        print("ARTIST_ID_SLAVE : Artist id not in ARTIST_INFO")
        return None
    return ARTIST_INFO.get(key)
def get_artwork_listings_slave(self, url): soup = TheMiner.fetch_page(url, ghost=True) # Artist's info and artwork listings are available on the same page. if soup is not None: try: name = soup.find('div', class_='artist-intro').find('div', class_='content').h1.text # Name will cause the crash if the page is not returned block = soup.find_all('div', class_='artist-container artist-container--details') print(f"BLOCK : {len(block)}") try: for chunk in block: items = chunk.find_all('figure', class_='artwork-item artwork-item--details') print(f"ITEMS : {len(items)}") for piece in items: paise = piece.find('div', class_='meta').text.strip() # print(paise) if "Sold" not in str(paise): # print("B") a = piece.find('a')['href'] if self.website.domain not in a: a = self.link_maker(a) if a not in self.artwork_listings: self.artwork_listings.append(a) except AttributeError: # print("A") pass self.get_artist_data(soup, url) except AttributeError: print("B") # Urls that get blocked are discarded from visited and added to listy for a recall. (linear if listy is # small and multithreaded if listy is large enough till, its brought of size. visited.discard(url) self.listy.append(url)
def get_artwork_data_slave(url):
    """Begin an artwork-page scrape: resolve the artist url and id; if the page
    is blocked, release it from `visited` and queue it for retry.

    Nested helper — relies on `self` from the enclosing scope.
    """
    soup = TheMiner.fetch_page(url, ghost=True)
    if soup is not None:
        # Initiation
        try:
            # Artist_url
            artist_url = soup.find('div', class_='artwork-focus').find_all(
                'div', class_='col-md-12 col-lg-6')
            try:
                artist_url = artist_url[1].find('h2').a['href']
                # FIX: the original referenced undefined names
                # `self_website_domain` / `self_link_maker` (typos for the
                # attribute accesses below) and would NameError at runtime.
                if self.website.domain not in artist_url:
                    artist_url = self.link_maker(artist_url)
            except AttributeError:
                artist_url = None
            # Artist_id
            artist_id = self.artist_id
        except AttributeError:
            # Comes here if the page is not returned by the website.
            visited.discard(url)
            # FIX: `self_listy` -> self.listy (same undefined-name typo).
            self.listy.append(url)
def recurrent(i_url, depth):
    """Collect priced artwork links from a grid page and follow the single
    'next page' link recursively; artist data is picked on the first page only."""
    soup = TheMiner.fetch_page(i_url)
    if soup is not None:
        product_list = soup.find('div', class_='product-list-wrapper')
        product_list = product_list.find_all('div', class_='grid-item')
        for product in product_list:
            item_price = str(product.find('div', class_='grid-item-price').text).strip().upper()
            # Discard the data that does not have a price.
            if not item_price == "SOLD":
                product_link = str(product.a['href']).strip()
                self.artwork_listings.append(product_link)
        # Get artist data if depth is "1", if depth is more than "1" ignore this block.
        if depth == 1:
            # Calling the function to fetch the artist data, and return artist_id
            self.get_artist_data(soup, i_url)
        # Follow the single forward-pagination link, if present.
        next_page = soup.find('div', class_='page-browser')
        if next_page is not None:
            next_page = next_page.find('div', class_='page-browser-next').a
            if next_page is not None:
                next_page = self.website.domain + str(next_page['href'])
                depth += 1
                recurrent(next_page, depth)
def get_artwork_data_slave(self, url):
    """Scrape one artwork page and write the full record via
    TheAuthour.write_artwork_price_image.

    Hard gates: a record is only written when seller_id, artist_id and medium
    are all resolved (RULES 3, 4) and a price was parsed (RULE 5).
    Styled-component class names are build hashes, hence the regex selectors.
    """
    soup = TheMiner.fetch_page(url)
    if soup is not None:
        # Field initiation ::
        artwork = None
        price = None
        type_ = None
        dimensions = None
        frame = None
        authenticity = None
        about = None
        artist_id = None
        image_loc = None
        year = None
        support = None
        signature = None
        # Material to be added to technique
        technique = None
        seller_id = None
        artist = None
        # Medium must always have "Painting" or "Sculpture" (RULE :: 2)
        medium = None
        # Seller_url
        seller_url = None
        seller_box = soup.find_all(
            'div',
            re.compile(
                r'Box-sc-15se88d-0 Flex-cw39ct-0 BorderBoxBase-sc-1072ama-0\.*'
            ))
        for se in seller_box:
            if se.get('data-test') == 'aboutTheWorkPartner':
                try:
                    seller_url = se.find('a')['href']
                    if self.website.domain not in seller_url:
                        seller_url = self.link_maker(seller_url)
                except TypeError:
                    # No anchor: fall back to the partner's display text.
                    seller_url = se.next.next.next.next.text
        # seller_id
        if seller_url is not None:
            seller_id = self.get_seller_id(seller_url)
        # artist url
        artist_url = None
        artist_box = soup.find_all(
            'div',
            re.compile(
                r'Box-sc-15se88d-0 Flex-cw39ct-0 BorderBoxBase-sc-1072ama-0\.*'
            ))
        for ar in artist_box:
            if ar.get('data-test') == 'artistInfo':
                try:
                    artist_url = ar.find('a')['href']
                    if self.website.domain not in artist_url:
                        artist_url = self.link_maker(artist_url)
                except TypeError:
                    pass
        artist_id = self.get_artist_id(artist_url)
        # except AttributeError:
        #     pass
        # Medium
        try:
            medium = soup.find(
                'dl', class_='Box-sc-15se88d-0 Flex-cw39ct-0 bKPevV'
            ).dd.text.strip()
            if "SCULPTURE" in str(medium).upper():
                medium = "Sculpture"
            elif "PAINTING" in str(medium).upper():
                medium = "Painting"
            else:
                medium = None
        except AttributeError:
            pass
        # Continue fetching data only if seller_id, artist_id and medium are found. (RULE :: 3, 4)
        if seller_id is not None and artist_id is not None and medium is not None:
            try:
                # Price: find the SaleMessage node, then keep digits/dots up to
                # the first "-" (range prices are truncated to the lower bound).
                price = soup.find_all(
                    'div',
                    class_=re.compile(
                        r'Box-sc-15se88d-0 Text-sc-18gcpao-0\.*'))
                for p in price:
                    if p.get('data-test') == 'SaleMessage':
                        price = p.text
                        break
                temp = ""
                for i in price:
                    if i == "-":
                        break
                    if i.isdigit():
                        temp += i
                    if i == ".":
                        temp += i
                # NOTE(review): `rate` is a module-level factor — presumably a
                # currency conversion rate; confirm where it is set.
                price = float(temp) * rate  # Price
            except AttributeError:
                price = None
            except ValueError:
                price = None
            except TypeError:
                price = None
            # RULE : 5
            if price is not None:
                # Find artist, artwork, year, type_, dimensions, support, frame,
                # signature, authenticity, about, image_loc (actual url of the
                # image), and technique.
                # Wish the code to break if either Artist's name or Artwork's
                # name are not found.
                # Artist: first non-empty text node inside the artwork sidebar.
                artist_name = soup.find_all(
                    'div', class_=re.compile(r'Box-sc-15se88d-0'))
                for a in artist_name:
                    if a.get('data-test') == 'artworkSidebar':
                        artist_ = a.find_all(
                            'div',
                            class_=re.compile(
                                r'Box-sc-15se88d-0 Text-sc-18gcpao-0\.*'))
                        for a in artist_:
                            if len(a.text.strip()) != 0:
                                artist = a.text
                                break
                        break
                # Artwork: h1 is "Title, Year".
                artwork_block = soup.find('h1').text.split(",")
                artwork = artwork_block[0].strip()
                try:
                    year = artwork_block[-1].strip()
                    t = ""
                    for y in year:
                        if str(y) == "-":
                            break
                        if str(y).isnumeric():
                            t += y
                    year = int(t)
                except ValueError:
                    year = None
                # type (unique or what)
                try:
                    type_ = soup.find(
                        'h1').nextSibling.nextSibling.nextSibling.text.strip()
                except AttributeError:
                    pass
                # Dimensions: the sibling div that mentions 'cm'.
                try:
                    dimensions = soup.find(
                        'h1').nextSibling.nextSibling.find_all('div')
                    for dim in dimensions:
                        if 'cm' in dim.text:
                            dimensions = dim.text.strip()
                except AttributeError:
                    pass
                # Technique
                try:
                    technique = soup.find('h1').nextSibling.text.strip()
                except AttributeError:
                    pass
                # frame, auth, sign — from the dl list after the partner block.
                try:
                    bundle = soup.find_all(
                        'div',
                        class_=re.compile(
                            r'Box-sc-15se88d-0 Flex-cw39ct-0 BorderBoxBase-sc-1072ama-0 BorderBox-sc-18mwadn-0 StackableBorderBox-sc-1odyc7i-0\.*'
                        ))
                    for b in bundle:
                        if b.get('data-test') == 'aboutTheWorkPartner':
                            bud = b.nextSibling
                            break
                    bundle = bud.find_all('dl')
                    for dl in bundle:
                        if dl.next.text.strip() == 'Signature':
                            signature = dl.dd.text.strip()
                            continue
                        if dl.dt.text.strip() == 'Certificate of authenticity':
                            authenticity = dl.dd.text.strip()
                            continue
                        if dl.dt.text.strip() == 'Frame':
                            frame = dl.dd.text.strip()
                            continue
                except AttributeError:
                    pass
                # About: re-join the description words with single spaces.
                try:
                    about = soup.find(
                        'div',
                        class_='Box-sc-15se88d-0 Text-sc-18gcpao-0 gPzDV'
                    ).find(
                        'div',
                        class_='ReadMore__Container-sc-1bqy0ya-0 guOJdN'
                    ).p.text.strip().split(" ")
                    t = ""
                    for a in about:
                        t += a.strip()
                        t += " "
                    about = t
                except AttributeError:
                    about = None
                # Image location
                try:
                    image_loc = soup.find_all('div', class_='Box-sc-15se88d-0')
                    for loc in image_loc:
                        if loc.get('data-test') == 'artworkImage':
                            image_loc = loc.find('img').get('src')
                            break
                except AttributeError:
                    pass
                artwork_bundle = {
                    "artwork_title": artwork,
                    "artist_name": artist,
                    "year": year,
                    "price": price,
                    "Medium": medium,
                    "Type": type_,
                    "Dimensions": dimensions,
                    "Support": support,
                    "Frame": frame,
                    "Signature": signature,
                    "Authenticity": authenticity,
                    "About": about,
                    "platform": self.website.platform,
                    "image_addr": image_loc,
                    "seller_id": seller_id,
                    "artist_id": artist_id,
                    "url": url,
                    "technique": technique
                }
                TheAuthour.write_artwork_price_image(**artwork_bundle)
            else:
                # Skipped: no price (RULE 5).
                pass
        else:
            # Skipped: missing seller_id / artist_id / medium (RULES 3, 4).
            pass
    else:
        # Page not fetched.
        pass
def get_artwork_data_slave(self, url):
    """Scrape one artwork page and write the record via self.write_artwork_data.

    Several lookups (seller link, h1 children, price container) are left
    unguarded on purpose: the scraper should crash on a layout change.
    """
    soup = TheMiner.fetch_page(url)
    if soup is not None:
        # Field initiation :: Artwork_title, artist, price, seller_id :: (picked),
        # medium, type, dimension, frame, authenticity, about :: year, support, signature
        # artist_id, Image_loc = None
        seller_id = None
        artist = None
        artwork = None
        price = None
        # Material to be added to medium
        material = None
        medium = None  # (painting or sculpture)
        type_ = None
        dimensions = None
        frame = None
        authenticity = None
        about = None
        artist_id = None
        image_loc = None
        year = None
        support = None
        signature = None
        # We want the code to break if this entry is not found so that we can fix it.
        # THE PAGE MUST HAVE A SELLER.
        seller_url = str(soup.find('div', class_='product-artist').a.get('href')).strip()
        # Seller_id: cache lookup first, scrape-then-lookup second.
        if seller_url is not None:
            if seller_url in SELLER_INFO:
                seller_id = SELLER_INFO.get(seller_url)
                print(seller_id)
            else:
                self.get_seller_data(seller_url)
                if seller_url in SELLER_INFO:
                    seller_id = SELLER_INFO.get(seller_url)
                else:
                    if seller_id is None:
                        print("SIRE THE MATRIX HAS GLITCHED. ENTRY NOT IN SELLER_INFO. WE SHALL BREAK.")
        else:
            if seller_id is None:
                print("SIRE THE MATRIX HAS GLITCHED. ENTRY NOT IN SELLER_INFO. WE SHALL BREAK.")
        # Artist_id — NOTE(review): keyed by seller_url, i.e. on this platform
        # the seller page doubles as the artist page; confirm.
        if seller_url is not None:
            if seller_url in ARTIST_INFO:
                artist_id = ARTIST_INFO.get(seller_url)
                print(artist_id)
            else:
                if artist_id is None:
                    print("SIRE THE MATRIX HAS GLITCHED. ENTRY NOT IN ARTIST_INFO. WE SHALL BREAK.")
        else:
            # If it ever comes to here, the page will not have a Seller/Artist
            if artist_id is None:
                print("SIRE THE MATRIX HAS GLITCHED. ENTRY NOT IN ARTIST_INFO. WE SHALL BREAK.")
        A = soup.h1
        B = A.find('div', class_='product-artist')
        artist = str(B.a.text).strip()  # Artist
        print(artist)
        artwork = str(A.find('div', class_='product-name').text).strip()  # Artwork
        print(artwork)
        # Price: keep only the digits of the displayed price string.
        price = str(soup.find('div', class_='product-price').find('div', class_='p-price-container').text).strip()
        temp = ""
        for i in price:
            if i.isdigit():
                temp += i
        price = int(temp)  # Price
        print(price)
        # Technical details table: label/value pairs.
        product_details_desc = soup.find('div', class_='product-details_desc')
        product_details = product_details_desc.find_all('div', class_='tech-item')
        for detail in product_details:
            label = str(detail.find('div', class_='tech-label').text).strip().upper()
            value = str(detail.find('div', class_='tech-value').text).strip()
            print(label)
            print(value)
            if label == 'TECHNIQUE':
                medium = value
            elif label == 'TYPE':
                type_ = value
            elif label == 'MATERIAL':
                # We don't need material. Adding material to medium??
                material = value
            elif label == 'DIMENSIONS':
                dimensions = value
            elif label == 'FRAMING':
                frame = value
            elif label == 'QUALITY GUARANTEE':
                authenticity = value
        try:
            about = str(product_details_desc.find('div', class_='desc text-1').text).strip()
        except AttributeError:
            about = None
        # If material is None, we don't add it to medium.
        if material is not None:
            # If medium is None, we make it a string before adding material to it.
            if medium is None:
                medium = ""
            else:
                medium += " "
            medium += material
        # self, artwork_title=None, artist_name=None, year=None, price=None, Dimensions=None, Medium=None,
        # Type=None, Support=None, Frame=None, Signature=None, Authenticity=None, About=None,
        # platform=None, image_addr=None, seller_id=None, artist_id=None)
        artwork_bundle = {"artwork_title": artwork, "artist_name": artist, "year": year,
                          "price": price, "Medium": medium, "Type": type_,
                          "Dimensions": dimensions, "Support": support, "Frame": frame,
                          "Signature": signature, "Authenticity": authenticity,
                          "About": about, "image_addr": image_loc,
                          "seller_id": seller_id, "artist_id": artist_id}
        self.write_artwork_data(**artwork_bundle)
def get_artwork_data_slave(self, url):
    """INCOMPLETE TEMPLATE — several right-hand sides were never filled in;
    this function does not parse as-is. Selectors must be supplied before use.
    """
    soup = TheMiner.fetch_page(url, ghost=True)
    if soup is not None:
        # Field initiation ::
        artwork = None
        price = None
        type_ = None
        dimensions = None
        frame = None
        authenticity = None
        about = None
        artist_id = None
        image_loc = None
        year = None
        support = None
        signature = None
        # Material to be added to technique
        technique = ""
        seller_id = None
        artist = None
        # Medium must always have "Painting" or "Sculpture" (RULE :: 2)
        if "/painting/" in str(url):
            medium = "Painting"  # (painting or sculpture)
        elif "/sculpture/" in str(url):
            medium = "Sculpture"
        else:
            # So that url leaks don't break the code.
            medium = None
        # Seller_id
        # We'll let it crash at seller_url not found because that is the way of the world.
        seller_url =  # TODO(review): selector never filled in
        seller_id = self.get_seller_id(seller_url)
        # Artist_id
        artist_url =  # TODO(review): selector never filled in
        artist_id = self.get_artist_id(artist_url)
        # Continue fetching data only if seller_id, artist_id and medium are found. (RULE :: 3, 4)
        if seller_id is not None and artist_id is not None and medium is not None:
            try:
                price  # TODO(review): price extraction never filled in
                temp = ""
                for i in price:
                    if i.isdigit():
                        temp += i
                    if i == ".":
                        temp += i
                price = float(temp)  # Price
            except AttributeError:
                price = None
            except ValueError:
                price = None
            # RULE : 5
            if price is not None:
                # Find artist, artwork, year, type_, dimensions, support, frame,
                # signature, authenticity, about, image_loc (actual url of the
                # image), and technique.
                # Wish the code to break if either Artist's name or Artwork's
                # name are not found.
                # Artist — TODO(review): `artist` is never assigned here.
                print(artist)
                # Artwork — TODO(review): `artwork` is never assigned here.
                print(artwork)
                try:
                    about =  # TODO(review): selector never filled in
                except AttributeError:
                    about = None
                artwork_bundle = {"artwork_title": artwork, "artist_name": artist,
                                  "year": year, "price": price, "Medium": medium,
                                  "Type": type_, "Dimensions": dimensions,
                                  "Support": support, "Frame": frame,
                                  "Signature": signature, "Authenticity": authenticity,
                                  "About": about, "platform": self.website.platform,
                                  "image_addr": image_loc, "seller_id": seller_id,
                                  "artist_id": artist_id, "url": url,
                                  "technique": technique}
                TheAuthour.write_artwork_price_image(**artwork_bundle)
            else:
                print(f"Skipping {url}\n PRICE : {price}")
        else:
            print(f"Skipping : {url}\nSeller_id = {seller_id}, Artist_id = {artist_id}, medium = {medium}")
def get_art_data_core(self, url):
    """Scrape one artwork page end-to-end and write artwork, image and price rows.

    Flow: fetch the page, resolve/insert the seller, parse the header
    (artist, title, year), price, image URL and the description list
    (dimensions, medium, type, support, frame, signature, authenticity,
    about), then persist via the ``db`` agents. Each parse stage is wrapped
    so a missing element degrades that field to ``None`` instead of aborting.

    :param url: absolute URL of the artwork page.
    """
    platform = self.website.platform
    # All output fields default to None so the result dict is always complete.
    artist_name = None
    artwork_title = None
    year = None
    price = None
    Dimensions = None
    Medium = None
    Type = None
    Support = None
    Frame = None
    Signature = None
    Authenticity = None
    About = None
    image_addr = None
    seller_id = None
    soup = TheMiner.fetch_page(url)
    if soup is not None:
        # Data to be picked here:
        # Artist's name, artwork's name, year, artwork description, price,
        # dimensions, medium (Sculpture/Painting), type (Copies or Unique),
        # frame, support, authenticity, website, image (12 fields).
        seller_id_trigger, seller_bundle = self.seller_info(soup)
        # seller_id_trigger can be 0, 1, or a real id:
        # - a real id comes with seller_bundle = None (seller already known);
        # - 0 comes with seller data in seller_bundle (must be inserted);
        # - 1 comes with no data in the bundle.
        if seller_bundle is None:
            seller_id = seller_id_trigger
        # THIS FOLLOWING BLOCK OF CODE NEEDS TO BE CONSISTENT ACROSS ALL THE
        # WEBSITE MODULES.
        # Insert the new seller and cache its id.
        elif seller_id_trigger == 0:
            seller_ds = SellerData(*seller_bundle)
            s_bundle = seller_ds.seller_bundle()
            # Write data to table "sellers".
            s_agent = db.Sellers()
            s_agent.create_table_sellers()
            seller_id = s_agent.insert_data_sellers(*s_bundle)
            # Cache the seller id for quick reuse and to reduce the number of
            # page fetches; keyed as "<name>_<location>".
            seller_name = seller_bundle[0]
            location = seller_bundle[1]
            SELLER_INFO["_".join([seller_name, location])] = seller_id
        else:
            seller_id = seller_id_trigger
        try:
            A = soup.find('section', id='informations')
            B = A.find('div', class_='relative')
            try:
                ## ARTIST'S NAME
                artist_name = B.find('span', class_='primary-title').text.strip()
                # print(artist_name)
            except:
                artist_name = None
            try:
                ## ARTWORK'S NAME
                # The secondary title is "<title parts...>, <year>"; everything
                # before the last comma is the title, the last segment the year.
                C = B.find('span', class_='secondary-title').text.strip()
                artwork_ = C.split(',')
                artwork_title = ""
                for a in range(len(artwork_)-1):
                    if a == 0:
                        artwork_title = artwork_[a]
                        continue
                    artwork_title = artwork_title + ", " + artwork_[a].strip()
                # print(artwork_title)
                # ARTWORK YEAR
                year = C.split(',')[-1].strip()
                # print(year)
            except:
                artwork_title = None
                year = None
            try:
                # PRICE — keep leading digits, stop at '-' (price ranges).
                price = A.find('p', class_='media-price price').text.strip()
                number = ''
                for p in price:
                    if p == '-':
                        break
                    if p.isdigit():
                        number += str(p)
                price = int(number)
                # print(price)
            except:
                price = None
            try:
                # Image url (full-size original, lazy-loaded via data-src).
                B = A.find('div', id='img-container')
                image_addr = B.find('img', id='img_original')['data-src']
                # print(image_addr)
            except:
                image_addr = None
        except:
            # The whole header section was missing — null all header fields.
            artist_name = None
            artwork_title = None
            year = None
            price = None
            image_addr = None
        try:
            D = soup.find('div', id='tabs-description').ul
            # Contains:: image, dimensions, medium, type, frame, support,
            # authenticity, signature.
            E = D.find_all('li')
            Dimensions = None
            Medium = None
            Type = None
            Support = None
            Frame = None
            Signature = None
            Authenticity = None
            About = None
            for e in E:
                a = e.text
                # Dimensions ('About the artwork' text can mention the other
                # labels, hence the extra exclusions on each branch).
                if 'Dimensions' in a and 'About the artwork' not in a and 'Support' not in a:
                    Dimensions = e.find('p', class_='pull-right').strong.text.strip() + ' (Height x Width x Depth)'
                    dim = True  # NOTE(review): never read afterwards
                    # print(Dimensions)
                    continue
                # Medium (Sculpture/Painting)
                if 'Medium' in a and 'About the artwork' not in a:
                    Medium = e.find('p', class_='pull-right').a.text.strip()
                    # print(Medium)
                    continue
                # Type
                if 'Type' in a and 'About the artwork' not in a:
                    Type = e.find('p', class_='pull-right text-right').text.strip().split(' ')[0]
                    # print(Type)
                    continue
                # Support (base)
                if 'Support' in a and 'About the artwork' not in a:
                    try:
                        f = e.find('p', class_='pull-right text-right').text.strip().split(' ')
                        Support = f[0] + '. ' + f[1].strip('\n')
                        f = e.find('p', class_='pull-right text-right').strong.text.strip().strip('\n')
                        Support += f
                    except IndexError:
                        # Single-word support value — take the text as-is.
                        Support = e.find('p', class_='pull-right text-right').text.strip()
                    # print(Support)
                    continue
                # Framing
                if 'Framing' in a and 'About the artwork' not in a:
                    Frame = e.find('p', class_='pull-right').text.strip()
                    # print(Frame)
                    continue
                # Signature
                if 'Signature' in a and 'About the artwork' not in a:
                    Signature = e.find('p', class_='pull-right').text.strip()
                    # print(Signature)
                    continue
                # Authenticity
                if 'Authenticity' in a and 'About the artwork' not in a:
                    Authenticity = e.find('p', class_='pull-right text-right').text.strip()
                    # print(Authenticity)
                    continue
                # Artwork Description — short lead paragraph plus the
                # collapsed "see more" catalogue text when present.
                if 'About the artwork' in a:
                    About = e.find('p', class_="marg-bot-10")
                    if About is not None:
                        a = e.find('div', class_="description-catalog see-more text-justify").text.strip()
                        About = About.text.strip()
                        About += a
                    else:
                        About = e.find('p', class_='').text.strip()
                    continue
                    # print(About)
        except:
            # Description tab absent — make all the fields Null.
            Dimensions = None
            Medium = None
            Type = None
            Support = None
            Frame = None
            Signature = None
            Authenticity = None
            About = None
        # Keyword shape expected by ArtworkData.
        result = {"artwork_title": artwork_title, "artist_name": artist_name,
                  "year": year, "price": price, "Dimensions": Dimensions,
                  "Medium": Medium, "Type": Type, "Support": Support,
                  "Frame": Frame, "Signature": Signature,
                  "Authenticity": Authenticity, "About": About,
                  'platform': platform, "image_addr": image_addr,
                  "seller_id": seller_id}
        artwork_item = ArtworkData(**result)
        # Downloading images is deferred: every ~100 instances the image pool
        # (image_url + artwork_id) is flushed to the db, and the whole pool is
        # downloaded at the end of the execution. The function for downloading
        # the images picks batches of 100 and lives with TheMiner in module
        # dataStructures; it is called by ArtworksData (in datastructures).
        # DON'T THREAD ANYTHING WITH THE DATA DOWNLOAD FUNCTION AS IT ITSELF
        # IS LAUNCHED ON A THREAD.
        art_bund = artwork_item.artwork_bundle()
        # WRITING ARTWORK
        dbartwork_agent = db.Artwork()
        dbartwork_agent.create_table_artwork()
        artwork_id = dbartwork_agent.insert_data_artwork(*art_bund)
        # Writing image-info
        # image_addr = result[13]
        image_bundle = artwork_item.image_bundle(artwork_id)
        dbimage_agent = db.Images()
        dbimage_agent.create_table_images()
        # dbimage_agent.insert_data_images(image_addr, artwork_id)
        dbimage_agent.insert_data_images(*image_bundle)
        # Price bundle can only be created once the artwork is written in the db.
        price_bund = artwork_item.price_bundle(artwork_id)
        # WRITING PRICES
        dbprice_agent = db.Price()
        dbprice_agent.create_table_prices()
        dbprice_agent.insert_data_prices(*price_bund)
def get_artwork_data_slave(self, url):
    """Scrape one KazoART artwork page and write its artwork/price/image record.

    Resolves the seller id and artist id first (on this platform the artist
    page doubles as the seller page), then parses title, price, technical
    details, description and image URL. The record is written via
    ``TheAuthour.write_artwork_price_image`` only when seller id, artist id
    and medium are all resolved (RULES 3 and 4); otherwise the URL is
    reported and skipped. Missing mandatory selectors (seller link, title,
    price) are deliberately allowed to raise so layout changes surface.

    :param url: absolute URL of the artwork page.
    """
    soup = TheMiner.fetch_page(url)
    if soup is not None:
        # Field initiation :: artwork title, artist, price, seller_id are
        # picked; medium, type, dimensions, frame, authenticity, about, year,
        # support, signature, artist_id and image_loc default to None.
        seller_id = None
        artist = None
        artwork = None
        price = None
        # Material and technique labels are folded into this one string.
        technique = ""
        # Medium must always have "Painting" or "Sculpture" (RULE :: 2)
        if "/painting/" in str(url):
            medium = "Painting"  # (painting or sculpture)
        elif "/sculpture/" in str(url):
            medium = "Sculpture"
        else:
            # So that url leaks don't break the code.
            medium = None
        type_ = None
        dimensions = None
        frame = None
        authenticity = None
        about = None
        artist_id = None
        image_loc = None
        year = None
        support = None
        signature = None
        # We want the code to break if this entry is not found so that we can
        # fix it. THE PAGE MUST HAVE A SELLER.
        seller_url = str(
            soup.find('div', class_='product-artist').a.get('href')).strip()
        # Seller_id
        seller_id = self.get_seller_id(seller_url)
        # Artist_id — the artist page is the seller page on this platform.
        artist_url = seller_url
        artist_id = self.get_artist_id(artist_url)
        # Continue fetching data only if seller_id, artist_id and medium are
        # found. (RULE :: 3, 4)
        if seller_id is not None and artist_id is not None and medium is not None:
            A = soup.h1
            B = A.find('div', class_='product-artist')
            artist = str(B.a.text).strip()  # Artist
            # print(artist)
            artwork = str(A.find('div', class_='product-name').text).strip()  # Artwork
            # print(artwork)
            price = str(
                soup.find('div', class_='product-price').find(
                    'div', class_='p-price-container').text).strip()
            # Keep only the digits and the decimal point from the price text.
            temp = ""
            for i in price:
                if i.isdigit():
                    temp += i
                if i == ".":
                    temp += i
            price = float(temp)  # Price
            # print(price)
            product_details_desc = soup.find('div', class_='product-details_desc')
            product_details = product_details_desc.find_all(
                'div', class_='tech-item')
            for detail in product_details:
                label = str(detail.find(
                    'div', class_='tech-label').text).strip().upper()
                value = str(detail.find('div', class_='tech-value').text).strip()
                # For KAZoART, technique(info) goes under Medium, and
                # Material(info) goes under Technique.
                if label == 'TECHNIQUE':
                    # BUGFIX: the old bare `technique.strip()` discarded its
                    # result (str.strip returns a new string).
                    technique = (technique + " " + value).strip()
                elif label == 'TYPE':
                    type_ = value
                elif label == 'MATERIAL':
                    # BUGFIX: append the material instead of overwriting the
                    # technique collected so far (was `technique = value`).
                    technique = (technique + " " + value).strip()
                elif label == 'DIMENSIONS':
                    dimensions = value
                elif label == 'FRAMING':
                    frame = value
                elif label == 'QUALITY GUARANTEE':
                    authenticity = value
                # Unknown labels are ignored on purpose.
            try:
                about = str(
                    product_details_desc.find(
                        'div', class_='desc text-1').text).strip()
            except AttributeError:
                about = None
            image_loc = soup.find('div', class_='product-left').find(
                'div', class_='img-wrapper').img.get('src')
            # print(image_loc)
            # Keyword shape expected by TheAuthour.write_artwork_price_image.
            artwork_bundle = {
                "artwork_title": artwork,
                "artist_name": artist,
                "year": year,
                "price": price,
                "Medium": medium,
                "Type": type_,
                "Dimensions": dimensions,
                "Support": support,
                "Frame": frame,
                "Signature": signature,
                "Authenticity": authenticity,
                "About": about,
                "platform": self.website.platform,
                "image_addr": image_loc,
                "seller_id": seller_id,
                "artist_id": artist_id,
                "url": url,
                "technique": technique
            }
            TheAuthour.write_artwork_price_image(**artwork_bundle)
            # self.write_artwork_data(**artwork_bundle)
        else:
            print(
                f"Skipping : {url}\nSeller_id = {seller_id}, Artist_id = {artist_id}, medium = {medium}"
            )
def get_artwork_data_slave(self, url):
    """Scrape one ArtMajeur artwork page and write its artwork/price/image record.

    Order of operations: fetch the page, parse the price (RULE 5 — no price,
    no record), resolve the seller id (falling back to writing a synthetic
    seller when the seller has no own page), resolve the artist id, derive
    the medium from the URL (RULE 2), then parse the remaining fields and
    persist via ``TheAuthour.write_artwork_price_image``. The record is only
    written when seller id, artist id and medium are all present (RULE 3).

    :param url: absolute URL of the artwork page.
    """
    soup = TheMiner.fetch_page(url)
    if soup is not None:
        # Field initiation :: every collected field defaults to None so the
        # final bundle is always fully keyed even when a selector misses.
        seller_id = None
        artist = None
        artwork = None
        price = None
        medium = None  # (painting or sculpture)
        technique = ""  # Material and style
        type_ = None
        dimensions = None
        frame = None
        authenticity = None
        about = None
        artist_id = None
        image_loc = None
        year = None
        support = None
        signature = None
        try:
            # PRICE — keep digits and decimal point, stop at '-' (ranges).
            A = soup.find('section', id='informations')
            price = str(A.find('p', class_='media-price price').text).strip()
            number = ''
            for p in price:
                if p == '-':
                    break
                if p.isdigit():
                    number += str(p)
                if p == ".":
                    number += str(p)
            price = float(number)
            # print(price)
        except AttributeError:
            pass
        except TypeError:
            pass
        # Rule : 5 — no price, no record.
        if price is not None:
            # Seller_id
            try:
                seller_url = soup.find(
                    'div', id='top-seller').find('a').get('href')
                # Normalize the French gallery URL to its English form.
                if 'galeries-d-art' in str(seller_url):
                    seller_url = re.sub('galeries-d-art', 'art-galleries',
                                        seller_url)
                # If seller_url is found.
                seller_id = self.get_seller_id(seller_url)
            except AttributeError:
                # There are pages where the seller has no other page. Then we
                # make the key ourselves from the seller's display name.
                seller_url = soup.find('div', id='top-seller').find(
                    'p', class_='highlight-title').text
                seller_url = str(seller_url).strip()
                if seller_url in SELLER_INFO:
                    seller_id = SELLER_INFO[seller_url]
                else:
                    location = soup.find('div', id='top-seller').find(
                        'p', class_='subtitle').text.strip().split(',')
                    location = str(location[-1]).strip()
                    seller_name = seller_url
                    bundle = [
                        seller_url, self.website.platform, seller_name,
                        location, None
                    ]
                    # We write the seller info directly and fetch the
                    # seller_id from the cache write_seller populates.
                    # NOTE(review): raises KeyError if write_seller did not
                    # register the key — TODO confirm that contract.
                    TheAuthour.write_seller(*bundle)
                    seller_id = SELLER_INFO[seller_url]
            # Artist_id
            try:
                artist_url = soup.find('section', id='informations').find(
                    'div', class_='relative').a.get('href')
                if "oeuvres-d-art-contemporain" in artist_url:
                    # BUGFIX: re.sub returns a new string; the old code
                    # discarded the result, so the URL was never normalized.
                    artist_url = re.sub("oeuvres-d-art-contemporain",
                                        "contemporary-artworks", artist_url)
                artist_id = self.get_artist_id(artist_url)
            except AttributeError:
                artist_id = None
                # NOTE(review): debug output + 50 s stall left in on purpose?
                # It blocks the worker whenever the artist link is missing.
                print("\n\n\n\n\n")
                print(url)
                print("\n\n\n\n\n")
                time.sleep(50)
            # Medium must always have "Painting" or "Sculpture" (RULE :: 2)
            la = str(url).split('/')
            if 'painting' in la:
                # BUGFIX: dropped the stray trailing space ("Painting ") so
                # the stored medium matches the other website modules.
                medium = "Painting"  # (painting or sculpture)
            elif 'sculpture' in la:
                medium = "Sculpture"
            else:
                # So that url leaks don't break the code.
                medium = None
            # IF either the seller_id or artist_id are missing, escape the
            # rest (Rule : 3). If medium is neither Painting nor Sculpture,
            # we don't fetch data (Rule : 2).
            if seller_id is not None and artist_id is not None and medium is not None:
                # ______________________________MAIN DATA FETCHING________________________
                A = soup.find('section', id='informations')
                B = A.find('div', class_='relative')
                # ARTIST'S NAME
                artist = B.find('span', class_='primary-title').text.strip()
                # print(artist)
                # ARTWORK'S NAME — secondary title is "<title...>, <year>";
                # everything before the last comma is the title.
                C = B.find('span', class_='secondary-title').text.strip()
                artwork_ = C.split(',')
                artwork_title = ""
                for a in range(len(artwork_) - 1):
                    if a == 0:
                        artwork_title = artwork_[a]
                        continue
                    artwork_title = artwork_title + ", " + artwork_[a].strip()
                artwork = artwork_title
                # print(artwork)
                try:
                    # ARTWORK YEAR — last comma-separated segment.
                    year = C.split(',')[-1].strip()
                    # print(year)
                except IndexError:
                    pass  # year stays None
                try:
                    # Image url (full-size original, lazy-loaded via data-src).
                    B = A.find('div', id='img-container')
                    image_loc = B.find('img', id='img_original')['data-src']
                    # print(image_loc)
                except AttributeError:
                    pass
                # Description tab contains:: image, dimensions, medium, type,
                # frame, support, authenticity, signature.
                try:
                    D = soup.find('div', id='tabs-description').ul
                    E = D.find_all('li')
                    for e in E:
                        a = e.text
                        # Dimensions ('About the artwork' can mention the
                        # other labels, hence the exclusions per branch).
                        if 'Dimensions' in a and 'About the artwork' not in a and 'Support' not in a:
                            dimensions = e.find(
                                'p', class_='pull-right').strong.text.strip()
                            # print(dimensions)
                            continue
                        # Medium (Sculpture/Painting) and Technique — collapse
                        # the whitespace-separated tokens into one string.
                        if 'Medium' in a and 'About the artwork' not in a:
                            parts = e.find(
                                'p', class_='pull-right').text.split(" ")
                            technique = " ".join(
                                t.strip() for t in parts if t != "").strip()
                            # print(technique)
                            continue
                        # Type
                        if 'Type' in a and 'About the artwork' not in a:
                            type_ = e.find('p', class_='pull-right text-right'
                                           ).text.strip().split(' ')[0]
                            # print(type_)
                            continue
                        # Support (base)
                        if 'Support' in a and 'About the artwork' not in a:
                            try:
                                f = e.find('p', class_='pull-right text-right'
                                           ).text.strip().split(' ')
                                support = f[0] + '. ' + f[1].strip('\n')
                                f = e.find(
                                    'p', class_='pull-right text-right'
                                ).strong.text.strip().strip('\n')
                                support += f
                            except IndexError:
                                # Single-word support value.
                                support = e.find(
                                    'p', class_='pull-right text-right'
                                ).text.strip()
                            # print(support)
                            continue
                        # Framing
                        if 'Framing' in a and 'About the artwork' not in a:
                            frame = e.find(
                                'p', class_='pull-right').text.strip()
                            # print(frame)
                            continue
                        # Signature
                        if 'Signature' in a and 'About the artwork' not in a:
                            signature = e.find(
                                'p', class_='pull-right').text.strip()
                            # print(signature)
                            continue
                        # Authenticity
                        if 'Authenticity' in a and 'About the artwork' not in a:
                            authenticity = e.find(
                                'p', class_='pull-right text-right').text.strip()
                            # print(authenticity)
                            continue
                        # Artwork Description — short lead paragraph plus the
                        # collapsed "see more" catalogue text when present.
                        if 'About the artwork' in a:
                            about = e.find('p', class_="marg-bot-10")
                            if about is not None:
                                a = e.find(
                                    'div',
                                    class_=
                                    "description-catalog see-more text-justify"
                                ).text.strip()
                                about = about.text.strip()
                                about += a
                            else:
                                about = e.find('p', class_='').text.strip()
                            continue
                            # print(about)
                except AttributeError:
                    pass  # description tab absent — detail fields stay None
                # Keyword shape expected by
                # TheAuthour.write_artwork_price_image.
                artwork_bundle = {
                    "artwork_title": artwork,
                    "artist_name": artist,
                    "year": year,
                    "price": price,
                    "Medium": medium,
                    "Type": type_,
                    "Dimensions": dimensions,
                    "Support": support,
                    "Frame": frame,
                    "Signature": signature,
                    "Authenticity": authenticity,
                    "About": about,
                    "platform": self.website.platform,
                    "image_addr": image_loc,
                    "seller_id": seller_id,
                    "artist_id": artist_id,
                    "url": url,
                    "technique": technique
                }
                # print(artwork_bundle)
                TheAuthour.write_artwork_price_image(**artwork_bundle)
            else:
                print(
                    f"SELLER ID :: {seller_id},\nARTIST ID :: {artist_id}")
        else:
            # If the price is not available, we skip the entire process.
            print(f"PRICE NOT FOUND : {price} at {url}")
    else:
        print(f"\n\n\n\n\nURL DIDN'T RETURN : {url}\n\n\n\n\n")