def get_artist_data(self, soup, url):
    """Extract an artist's profile from an artist page and persist it.

    Called by self.get_artwork_listings_slave().

    Args:
        soup: Parsed artist page (BeautifulSoup-style object).
        url: Artist page URL; used as the key into ``KEY_INFO``.

    Raises:
        AttributeError: If the ``artist-resume`` block or its ``<h1>`` is
            missing. This is deliberate: it means the page layout changed
            and the scraper needs to be fixed.
    """
    artist_resume = soup.find('div', class_='artist-resume').find(
        'div', class_='artist-resume_text')
    # str.strip() always returns a string, so no None check is needed here;
    # a missing name surfaces as the AttributeError documented above.
    name = artist_resume.h1.text.strip()
    print(name)

    # Country: last comma-separated token of the first line of the location.
    try:
        location_lines = artist_resume.find(
            'p', class_='location').text.strip().split('\n')
        country = location_lines[0].split(',')[-1].strip()
        print(country)
    except AttributeError:
        country = None

    # About is either some text or None (a missing block used to crash here).
    about_block = soup.find('div', id='about')
    about = about_block.text.strip() if about_block is not None else None

    # pack = [name, born, country, about]; birth year is not picked here.
    artist_data_pack = [name, None, country, about]
    KEY_INFO[url] = db.Artist.key_maker(artist_data_pack)
    TheAuthour.write_artist(*artist_data_pack)
def get_seller_id(self, seller_url) -> int:
    """Return the seller id for ``seller_url``, creating the seller if needed.

    Lookup order:
      1. ``SELLER_INFO`` as it stands.
      2. ``SELLER_INFO`` after ``self.get_seller_data(seller_url)`` has run.
      3. ``SELLER_INFO`` after writing a minimal 'BAREBONES' seller entry.

    Returns:
        The id stored in ``SELLER_INFO`` for ``seller_url``, or ``None`` when
        ``seller_url`` is ``None`` or no entry could be produced. Callers use
        ``None`` to stop gathering data for the artwork.
    """
    if seller_url is None:
        print("FATAL ERROR :: Seller_id not found.")
        return None

    if seller_url in SELLER_INFO:
        return SELLER_INFO.get(seller_url)

    # No entry yet: scrape the seller page, then look again.
    self.get_seller_data(seller_url)
    if seller_url in SELLER_INFO:
        return SELLER_INFO.get(seller_url)

    # Still nothing: write a bare-bones record so that an id gets generated.
    TheAuthour.write_seller(
        seller_url, self.website.platform, 'BAREBONES', None, seller_url)
    if seller_url in SELLER_INFO:
        return SELLER_INFO.get(seller_url)

    print("FATAL ERROR :: Seller_id not found.")
    return None
def get_seller_data(self, url):
    """Fetch a seller page and write the seller record.

    Caller :: get_artwork_data_slave

    The seller name is mandatory: if the ``top-seller`` block or its ``<h1>``
    is missing the resulting AttributeError is allowed to propagate.
    Location and website are optional and fall back to ``None``.
    """
    visited.discard(url)
    soup = TheMiner.fetch_page(url)
    if soup is None:
        return

    header = soup.find('div', id='top-seller')
    # Code will break here if the seller's name is not found.
    seller_name = header.h1.text.strip()

    # Location: last comma-separated token of the subtitle.
    try:
        subtitle_parts = header.find(
            'p', class_="subtitle").text.strip().split(',')
        location = subtitle_parts[-1].strip()
    except AttributeError:
        location = None

    # Website: first link inside the "websites" list.
    try:
        website = str(soup.find('ul', id="websites").a['href']).strip()
    except (AttributeError, TypeError):
        website = None

    TheAuthour.write_seller(
        url, self.website.platform, seller_name, location, website)
def get_seller_id(self, seller_url) -> int:
    """Return the seller id for ``seller_url``, writing a 'KAZoART' seller
    entry first when ``SELLER_INFO`` has none.

    Returns:
        The id stored in ``SELLER_INFO``, or ``None`` when ``seller_url`` is
        ``None`` or the write did not produce an entry.
    """
    if seller_url is None:
        print("FATAL ERROR :: Seller_id not found.")
        return None

    if seller_url in SELLER_INFO:
        return SELLER_INFO.get(seller_url)

    # bundle = [seller_url, platform, seller's name, location, website]
    TheAuthour.write_seller(
        seller_url, self.website.platform, 'KAZoART', None, seller_url)
    if seller_url in SELLER_INFO:
        return SELLER_INFO.get(seller_url)

    print("FATAL ERROR :: Seller_id not found.")
    # Return the id even though it is None.
    return None
def get_artist_data(self, soup, url):
    """Template for a site-specific artist scraper.

    Called by self.get_artwork_listings_slave(). Each ``TODO`` marks where
    the site-specific extraction belongs. Until the name extraction is
    filled in, ``name`` stays ``None`` and nothing is written (the
    placeholders were previously unassigned, which raised NameError).

    Args:
        soup: Parsed artist page.
        url: Artist page URL; used as the key into ``KEY_INFO``.
    """
    # TODO: Pick artist's name here.
    name = None
    print(name)

    # If an error occurs while picking the name, the page layout has changed
    # and the code needs to be fixed.
    if name is not None:
        try:
            # TODO: Pick artist's country here.
            country = None
            print(country)
        except AttributeError:
            country = None

        try:
            # TODO: Pick birth year here.
            born = None
            print(born)
        except AttributeError:
            born = None

        try:
            # TODO: Pick artist's description here.
            about = None
            print(about)
        except AttributeError:
            about = None

        # pack = [name, born, country, about]
        artist_data_pack = [name, born, country, about]
        # Updating KEY_INFO dictionary.
        KEY_INFO[url] = db.Artist.key_maker(artist_data_pack)
        # Updating the dB with artist listings.
        TheAuthour.write_artist(*artist_data_pack)
def get_seller_data(self, url):
    """Template for a site-specific seller scraper.

    Caller :: get_artwork_data_slave and get_seller_id

    Each ``TODO`` marks where the site-specific extraction belongs. The
    placeholders are initialised to ``None`` (they were previously
    unassigned, which raised NameError) and nothing is written until a
    seller name has actually been extracted.
    """
    visited.discard(url)
    soup = TheMiner.fetch_page(url)
    if soup is None:
        return

    # TODO: Pick seller's name here. The real extraction should be allowed
    # to break if the seller's name is not found.
    seller_name = None
    print(seller_name)

    # Location
    try:
        # TODO: Pick seller's location here.
        location = None
        print(location)
    except AttributeError:
        location = None

    # Website
    try:
        # TODO: Pick seller's website here.
        website = None
        print(website)
    except (AttributeError, TypeError):
        website = None

    if seller_name is None:
        # Template not filled in yet: do not write a nameless seller.
        return

    bundle = [url, self.website.platform, seller_name, location, website]
    print(bundle)
    TheAuthour.write_seller(*bundle)
def key_maker(artist_url):
    """Scrape an artist page with headless Firefox and register the artist.

    Loads ``artist_url`` in a Selenium-driven Firefox, reads the artist's
    name, country and description, stores the generated key in ``KEY_INFO``
    and writes the artist record.

    Args:
        artist_url: URL of the artist page.

    Returns:
        The artist key stored in ``KEY_INFO[artist_url]``, or ``None`` when
        no artist name could be found on the page.
    """
    options = Options()
    # NOTE(review): newer Selenium releases dropped the ``headless``
    # attribute in favour of ``options.add_argument("-headless")`` - confirm
    # against the pinned Selenium version.
    options.headless = True
    driver = webdriver.Firefox(options=options)
    try:
        visited.discard(artist_url)
        # The page has to be loaded before page_source is read; previously
        # the driver never navigated anywhere.
        driver.get(artist_url)
        # The second argument is the parser name, not the page URL.
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        n_c = soup.find_all('h2', class_='font_2')

        # Artist's name
        try:
            name = n_c[0].text.strip()
        except IndexError:
            print(n_c)
            name = None
        if name is None:
            return None

        # Country: second heading, when present.
        try:
            country = n_c[1].text.strip()
        except (AttributeError, IndexError):
            country = None

        # About: every 'font_8' paragraph, each followed by a space.
        try:
            about = "".join(
                paragraph.text.strip() + " "
                for paragraph in soup.find_all('p', class_='font_8'))
        except (AttributeError, TypeError):
            about = None

        # pack = [name, born, country, about]
        artist_data_pack = [name, None, country, about]
        # Updating KEY_INFO dictionary.
        KEY_INFO[artist_url] = db.Artist.key_maker(artist_data_pack)
        key = KEY_INFO.get(artist_url)
        # Updating the dB with artist listings.
        TheAuthour.write_artist(*artist_data_pack)
        return key
    finally:
        # Always release the browser, including on errors.
        driver.quit()
def get_artist_data(self, soup, url):
    """Extract an artist's profile from the biography block and persist it.

    Called by self.get_artwork_listings_slave().

    Args:
        soup: Parsed artist page.
        url: Artist page URL; used as the key into ``KEY_INFO``.

    Raises:
        AttributeError: If the ``biography`` block or its ``<h1>`` is
            missing - the code is meant to break when the name goes missing.
    """
    biography = soup.find('div', id='biography')

    # Artist's name
    name = biography.h1.text.strip()

    # Born: all digits of the birthday text, read as one integer.
    try:
        sub_title = biography.find(
            'div', class_='sub-title col-sm-9 col-xs-12')
        birthday = sub_title.find('span', class_='birthday-date').text
        # int('') raises ValueError when the text holds no digits.
        born = int("".join(ch for ch in birthday if ch.isdigit()))
    except (AttributeError, ValueError):
        born = None

    # Country
    try:
        sub_title = biography.find(
            'div', class_='sub-title col-sm-9 col-xs-12')
        country = sub_title.span.text.strip()
    except AttributeError:
        country = None

    # About
    try:
        raw_about = biography.find(
            'div', class_='col-sm-9 col-xs-12 biography').text.strip()
        chunks = raw_about.split(" ")
        # NOTE(review): the final chunk is dropped, as in the original
        # loop over range(len - 1) - confirm that this is intentional.
        about = "\n".join(chunk.strip() for chunk in chunks[:-1]).strip()
    except AttributeError:
        about = None

    # pack = [name, born, country, about]
    artist_data_pack = [name, born, country, about]
    KEY_INFO[url] = db.Artist.key_maker(artist_data_pack)
    TheAuthour.write_artist(*artist_data_pack)
def get_artist_data(self, soup, url):
    """Extract an artist's profile from an artist page and persist it.

    Nothing is written when the artist's name cannot be found.

    Args:
        soup: Parsed artist page (the page is not fetched here).
        url: Artist page URL; used as the key into ``KEY_INFO``.
    """
    try:
        name = soup.find('div', class_='artist-intro').find('h1').text
        name = str(name).strip()
    except AttributeError:
        name = None
    if name is None:
        return

    # Born
    try:
        born_text = soup.find('p', class_='born').text.strip()
        digits = "".join(ch for ch in born_text if str(ch).isdigit())
        born = int(digits)
        if born > 3000:
            # Several years ran together (e.g. "1950 - 2010"): keep the
            # first four digits as an int. The old [0:3] slice produced a
            # three-character string.
            born = int(str(born)[:4])
    except (AttributeError, ValueError):
        born = None

    # Country: last "|"-separated token of the intro sub-heading.
    try:
        intro = soup.find('div', class_="artist-intro")
        heading_parts = intro.find(
            'div', class_='h2').text.strip().split("|")
        country = str(heading_parts[-1]).strip()
    except AttributeError:
        country = None

    # About
    try:
        bio = soup.find('section', class_='artist-bio')
        about = bio.find('div', class_='resume').text.strip()
    except AttributeError:
        about = None

    # pack = [name, born, country, about]
    artist_data_pack = [name, born, country, about]
    # Updating KEY_INFO dictionary.
    KEY_INFO[url] = db.Artist.key_maker(artist_data_pack)
    # Updating the dB with artist listings.
    TheAuthour.write_artist(*artist_data_pack)
def get_seller_id(self, seller_url) -> int:
    """Return the seller id for ``seller_url``, creating the seller if needed.

    Lookup order:
      1. ``SELLER_INFO`` as it stands.
      2. ``SELLER_INFO`` after ``self.get_seller_data(seller_url)`` has run.
      3. ``SELLER_INFO`` after writing a fallback entry whose name and
         website are both ``seller_url`` (used when ``seller_url`` is not a
         real url and get_seller_data could not make an entry).

    Returns:
        The id stored in ``SELLER_INFO``, or ``None`` when ``seller_url`` is
        ``None`` or no entry could be produced. A ``None`` stops
        get_artwork_data_slave from gathering data beyond the rule 3 check.
    """
    if seller_url is None:
        print("FATAL ERROR :: Seller_id not found.")
        return None

    if seller_url in SELLER_INFO:
        return SELLER_INFO.get(seller_url)

    # The seller is not known yet: try scraping the seller page.
    self.get_seller_data(seller_url)
    if seller_url in SELLER_INFO:
        return SELLER_INFO.get(seller_url)

    # bundle = [seller_url, platform, seller's name, location, website]
    TheAuthour.write_seller(
        seller_url, self.website.platform, seller_url, None, seller_url)
    if seller_url in SELLER_INFO:
        # Expected to hold unless the write failed unexpectedly.
        return SELLER_INFO.get(seller_url)

    print("FATAL ERROR :: Seller_id not found.")
    return None
def get_artwork_data_slave(self, url):
    """Scrape one artwork page and write the artwork record.

    Processing stops early (with a printed message) when the page cannot be
    fetched, when no price is found (rule 5), or when the seller id, the
    artist id or the medium is missing (rules 2 and 3).

    Args:
        url: Artwork page URL. The medium is derived from its path segments.

    Raises:
        AttributeError: If the artist or artwork name blocks are missing
            once all the rule checks have passed.
    """
    soup = TheMiner.fetch_page(url)
    if soup is None:
        print(f"\n\n\n\n\nURL DIDN'T RETURN : {url}\n\n\n\n\n")
        return

    # Field initiation
    seller_id = None
    artist_id = None
    price = None
    technique = ""  # Material and style
    type_ = None
    dimensions = None
    frame = None
    authenticity = None
    about = None
    image_loc = None
    support = None
    signature = None

    # PRICE: digits and dots up to the first '-'.
    try:
        info = soup.find('section', id='informations')
        price_text = str(
            info.find('p', class_='media-price price').text).strip()
        number = ''
        for ch in price_text:
            if ch == '-':
                break
            if ch.isdigit() or ch == ".":
                number += ch
        # float('') raises ValueError when the text holds no number.
        price = float(number)
    except (AttributeError, TypeError, ValueError):
        price = None

    # Rule : 5
    if price is None:
        # If the price is not available, we skip the entire process.
        print(f"PRICE NOT FOUND : {price} at {url}")
        return

    # Seller_id
    try:
        seller_url = soup.find('div', id='top-seller').find('a').get('href')
        if 'galeries-d-art' in str(seller_url):
            seller_url = re.sub('galeries-d-art', 'art-galleries', seller_url)
        seller_id = self.get_seller_id(seller_url)
    except AttributeError:
        # There are pages where the seller has no page of its own. Then the
        # seller's display name is used in place of the url.
        top_seller = soup.find('div', id='top-seller')
        seller_url = str(
            top_seller.find('p', class_='highlight-title').text).strip()
        if seller_url in SELLER_INFO:
            seller_id = SELLER_INFO[seller_url]
        else:
            location = top_seller.find(
                'p', class_='subtitle').text.strip().split(',')
            location = str(location[-1]).strip()
            bundle = [
                seller_url, self.website.platform, seller_url, location, None
            ]
            # Write the seller info directly and fetch the seller_id.
            TheAuthour.write_seller(*bundle)
            seller_id = SELLER_INFO[seller_url]

    # Artist_id
    try:
        artist_url = soup.find('section', id='informations').find(
            'div', class_='relative').a.get('href')
        if "oeuvres-d-art-contemporain" in artist_url:
            # re.sub returns a new string; the result used to be discarded.
            artist_url = re.sub("oeuvres-d-art-contemporain",
                                "contemporary-artworks", artist_url)
        artist_id = self.get_artist_id(artist_url)
    except AttributeError:
        artist_id = None
        print("\n\n\n\n\n")
        print(url)
        print("\n\n\n\n\n")
        # NOTE(review): 50 second pause kept from the original - presumably
        # there so the failing url can be noticed; confirm it is wanted.
        time.sleep(50)

    # Medium must always have "Painting" or "Sculpture" (RULE :: 2)
    segments = str(url).split('/')
    if 'painting' in segments:
        # NOTE(review): the trailing space is kept from the original value.
        medium = "Painting "
    elif 'sculpture' in segments:
        medium = "Sculpture"
    else:
        # So that url leaks don't break the code.
        medium = None

    # If the seller id, the artist_id or the medium is missing, escape the
    # rest. (Rules : 2, 3)
    if seller_id is None or artist_id is None or medium is None:
        print(f"SELLER ID :: {seller_id},\nARTIST ID :: {artist_id}")
        return

    # ______________________________MAIN DATA FETCHING_______________________
    info = soup.find('section', id='informations')
    heading = info.find('div', class_='relative')

    # ARTIST'S NAME
    artist = heading.find('span', class_='primary-title').text.strip()

    # ARTWORK'S NAME: everything before the last comma. The first part is
    # kept as is, the following parts are stripped.
    title_line = heading.find('span', class_='secondary-title').text.strip()
    title_parts = title_line.split(',')[:-1]
    if title_parts:
        artwork = ", ".join(
            [title_parts[0]] + [part.strip() for part in title_parts[1:]])
    else:
        artwork = ""

    # ARTWORK YEAR: whatever follows the last comma.
    year = title_line.split(',')[-1].strip()

    # Image url
    try:
        container = info.find('div', id='img-container')
        image_loc = container.find('img', id='img_original')['data-src']
    except AttributeError:
        pass

    # Contains:: dimensions, technique, type, support, frame, signature,
    # authenticity, about. An AttributeError on any item ends the scan.
    try:
        items = soup.find('div', id='tabs-description').ul.find_all('li')
        for item in items:
            text = item.text
            is_about = 'About the artwork' in text

            # Dimensions
            if 'Dimensions' in text and not is_about and 'Support' not in text:
                dimensions = item.find(
                    'p', class_='pull-right').strong.text.strip()
                continue

            # Technique
            if 'Medium' in text and not is_about:
                words = item.find('p', class_='pull-right').text.split(" ")
                technique = "".join(
                    word.strip() + " " for word in words if word != "")
                continue

            # Type
            if 'Type' in text and not is_about:
                type_ = item.find(
                    'p', class_='pull-right text-right'
                ).text.strip().split(' ')[0]
                continue

            # Support (base)
            if 'Support' in text and not is_about:
                try:
                    parts = item.find(
                        'p', class_='pull-right text-right'
                    ).text.strip().split(' ')
                    support = parts[0] + '. ' + parts[1].strip('\n')
                    support += item.find(
                        'p', class_='pull-right text-right'
                    ).strong.text.strip().strip('\n')
                except IndexError:
                    support = item.find(
                        'p', class_='pull-right text-right').text.strip()
                continue

            # Framing
            if 'Framing' in text and not is_about:
                frame = item.find('p', class_='pull-right').text.strip()
                continue

            # Signature
            if 'Signature' in text and not is_about:
                signature = item.find('p', class_='pull-right').text.strip()
                continue

            # Authenticity
            if 'Authenticity' in text and not is_about:
                authenticity = item.find(
                    'p', class_='pull-right text-right').text.strip()
                continue

            # Artwork Description
            if is_about:
                lead = item.find('p', class_="marg-bot-10")
                if lead is not None:
                    # 'about' is only assigned once both parts were read,
                    # so a failed lookup cannot leave a Tag object in it.
                    rest = item.find(
                        'div',
                        class_="description-catalog see-more text-justify"
                    ).text.strip()
                    about = lead.text.strip() + rest
                else:
                    about = item.find('p', class_='').text.strip()
                continue
    except AttributeError:
        pass

    artwork_bundle = {
        "artwork_title": artwork,
        "artist_name": artist,
        "year": year,
        "price": price,
        "Medium": medium,
        "Type": type_,
        "Dimensions": dimensions,
        "Support": support,
        "Frame": frame,
        "Signature": signature,
        "Authenticity": authenticity,
        "About": about,
        "platform": self.website.platform,
        "image_addr": image_loc,
        "seller_id": seller_id,
        "artist_id": artist_id,
        "url": url,
        "technique": technique
    }
    TheAuthour.write_artwork_price_image(**artwork_bundle)
def get_artwork_data_slave(self, url):
    """Scrape one artwork page and write the artwork record.

    Processing stops silently when the page cannot be fetched, when the
    seller id, the artist id or the medium is missing (rules 3 and 4), or
    when no price can be parsed (rule 5).

    Args:
        url: Artwork page URL.

    Raises:
        AttributeError: If the page has no ``<h1>`` once all the rule checks
            have passed - the code is meant to break when the artwork's name
            cannot be found.
    """
    soup = TheMiner.fetch_page(url)
    if soup is None:
        return

    # Field initiation
    artist = None
    type_ = None
    dimensions = None
    frame = None
    authenticity = None
    about = None
    artist_id = None
    image_loc = None
    year = None
    support = None
    signature = None
    technique = None
    seller_id = None
    medium = None

    # The seller box and the artist box share the same class pattern and
    # are told apart by their 'data-test' attribute.
    info_boxes = soup.find_all(
        'div',
        re.compile(
            r'Box-sc-15se88d-0 Flex-cw39ct-0 BorderBoxBase-sc-1072ama-0\.*'))

    # Seller_url
    seller_url = None
    for box in info_boxes:
        if box.get('data-test') == 'aboutTheWorkPartner':
            try:
                seller_url = box.find('a')['href']
                if self.website.domain not in seller_url:
                    seller_url = self.link_maker(seller_url)
            except TypeError:
                # No link in the box: fall back to the text found four
                # elements into the box.
                seller_url = box.next.next.next.next.text

    # seller_id
    if seller_url is not None:
        seller_id = self.get_seller_id(seller_url)

    # artist url
    artist_url = None
    for box in info_boxes:
        if box.get('data-test') == 'artistInfo':
            try:
                artist_url = box.find('a')['href']
                if self.website.domain not in artist_url:
                    artist_url = self.link_maker(artist_url)
            except TypeError:
                pass

    # artist_id: only looked up when a url was found, like the seller.
    if artist_url is not None:
        artist_id = self.get_artist_id(artist_url)

    # Medium must always have "Painting" or "Sculpture" (RULE :: 2)
    try:
        medium_text = soup.find(
            'dl', class_='Box-sc-15se88d-0 Flex-cw39ct-0 bKPevV'
        ).dd.text.strip()
        if "SCULPTURE" in str(medium_text).upper():
            medium = "Sculpture"
        elif "PAINTING" in str(medium_text).upper():
            medium = "Painting"
    except AttributeError:
        pass

    # Continue fetching data only if seller_id, artist_id and medium are
    # found. (RULE :: 3, 4)
    if seller_id is None or artist_id is None or medium is None:
        return

    # Price: digits and dots of the 'SaleMessage' text up to the first '-',
    # multiplied by the conversion rate.
    try:
        sale_text = None
        for block in soup.find_all(
                'div',
                class_=re.compile(r'Box-sc-15se88d-0 Text-sc-18gcpao-0\.*')):
            if block.get('data-test') == 'SaleMessage':
                sale_text = block.text
                break
        if sale_text is None:
            price = None
        else:
            number = ""
            for ch in sale_text:
                if ch == "-":
                    break
                if ch.isdigit() or ch == ".":
                    number += ch
            price = float(number) * rate
    except (AttributeError, ValueError, TypeError):
        price = None

    # RULE : 5
    if price is None:
        return

    # Artist: first non-empty text block inside the artwork sidebar.
    for sidebar in soup.find_all(
            'div', class_=re.compile(r'Box-sc-15se88d-0')):
        if sidebar.get('data-test') == 'artworkSidebar':
            for text_block in sidebar.find_all(
                    'div',
                    class_=re.compile(
                        r'Box-sc-15se88d-0 Text-sc-18gcpao-0\.*')):
                if len(text_block.text.strip()) != 0:
                    artist = text_block.text
                    break
            break

    # Artwork: the <h1> reads "title, ..., year".
    artwork_block = soup.find('h1').text.split(",")
    artwork = artwork_block[0].strip()
    try:
        year_digits = ""
        for ch in artwork_block[-1].strip():
            if str(ch) == "-":
                break
            if str(ch).isnumeric():
                year_digits += ch
        year = int(year_digits)
    except ValueError:
        year = None

    # type (unique or what)
    try:
        type_ = soup.find(
            'h1').nextSibling.nextSibling.nextSibling.text.strip()
    except AttributeError:
        pass

    # Dimensions: the last <div> after the title whose text mentions 'cm'.
    # The result list itself is never stored as the value.
    try:
        for div in soup.find('h1').nextSibling.nextSibling.find_all('div'):
            if 'cm' in div.text:
                dimensions = div.text.strip()
    except AttributeError:
        pass

    # Technique
    try:
        technique = soup.find('h1').nextSibling.text.strip()
    except AttributeError:
        pass

    # frame, auth, sign: read from the element that follows the seller box.
    try:
        details = None  # stays None when no seller box is present
        for box in soup.find_all(
                'div',
                class_=re.compile(
                    r'Box-sc-15se88d-0 Flex-cw39ct-0 '
                    r'BorderBoxBase-sc-1072ama-0 BorderBox-sc-18mwadn-0 '
                    r'StackableBorderBox-sc-1odyc7i-0\.*')):
            if box.get('data-test') == 'aboutTheWorkPartner':
                details = box.nextSibling
                break
        for dl in details.find_all('dl'):
            if dl.next.text.strip() == 'Signature':
                signature = dl.dd.text.strip()
                continue
            if dl.dt.text.strip() == 'Certificate of authenticity':
                authenticity = dl.dd.text.strip()
                continue
            if dl.dt.text.strip() == 'Frame':
                frame = dl.dd.text.strip()
                continue
    except AttributeError:
        pass

    # About
    try:
        words = soup.find(
            'div', class_='Box-sc-15se88d-0 Text-sc-18gcpao-0 gPzDV'
        ).find(
            'div', class_='ReadMore__Container-sc-1bqy0ya-0 guOJdN'
        ).p.text.strip().split(" ")
        about = "".join(word.strip() + " " for word in words)
    except AttributeError:
        about = None

    # Image location: stays None when no image box or <img> is found.
    try:
        for box in soup.find_all('div', class_='Box-sc-15se88d-0'):
            if box.get('data-test') == 'artworkImage':
                image_loc = box.find('img').get('src')
                break
    except AttributeError:
        pass

    artwork_bundle = {
        "artwork_title": artwork,
        "artist_name": artist,
        "year": year,
        "price": price,
        "Medium": medium,
        "Type": type_,
        "Dimensions": dimensions,
        "Support": support,
        "Frame": frame,
        "Signature": signature,
        "Authenticity": authenticity,
        "About": about,
        "platform": self.website.platform,
        "image_addr": image_loc,
        "seller_id": seller_id,
        "artist_id": artist_id,
        "url": url,
        "technique": technique
    }
    TheAuthour.write_artwork_price_image(**artwork_bundle)
def get_seller_data(self, url):
    """Fetch a seller page and write the seller record.

    Caller :: get_artwork_data_slave and get_seller_id. Reached only when
    the seller is not yet in SELLER_INFO.

    Nothing is written when the page cannot be fetched or when the seller's
    name is not found in the ``jumpto--PartnerHeader`` block.
    """
    visited.discard(url)
    soup = TheMiner.fetch_page(url)
    if soup is None:
        return

    # Seller's Name
    seller_name = None
    try:
        seller_box = soup.find('div', id='jumpto--PartnerHeader')
        seller_name = seller_box.h1.text.strip()
    except AttributeError:
        pass
    if seller_name is None:
        return

    # Location: taken from whatever calling the heading's next sibling
    # returns; joined piece by piece when that result has no single .text.
    try:
        location = ""
        pieces = seller_box.h1.nextSibling()
        try:
            location = pieces.text
        except AttributeError:
            for piece in pieces:
                location += piece.text
                location += " "
    except (AttributeError, TypeError):
        location = None

    # Website: first link whose href contains "http"; None when there is no
    # such link (previously the whole list of <a> tags was stored instead).
    website = None
    try:
        for link in soup.find_all('a'):
            if "http" in str(link.get('href')):
                website = link.get('href')
                print(link.get('href'))
                break
    except (AttributeError, IndexError):
        website = None

    bundle = [url, self.website.platform, seller_name, location, website]
    TheAuthour.write_seller(*bundle)
def get_artist_data(self, soup, url):
    """Extract an artist's profile from an artist page and persist it.

    Called by self.get_artwork_listings_slave().

    Args:
        soup: Parsed artist page.
        url: Artist page URL; used as the key into ``KEY_INFO``.

    Raises:
        AttributeError: If the page has no ``<h1>``. That means the page
            layout has changed and the code needs to be fixed.
    """
    # Exact nationality matches are checked first, then substring matches
    # in this order.
    exact_countries = {"American": "USA", "Japanese": "Japan"}
    substring_countries = (
        ("French", "France"),
        ("Argentine", "Argentina"),
        ("Dutch", "Netherlands"),
        ("Indian", "India"),
        ("Pakistani", "Pakistan"),
        ("Italian", "Italy"),
        ("English", "UK"),
        ("Chinese", "China"),
        ("Hispanic", "Spain"),
        ("German", "Germany"),
        ("Spanish", "Spain"),
        ("Russian", "Russia"),
        ("British", "UK"),
        ("Mexican", "Mexico"),
        ("Brazilian", "Brazil"),
        ("Canadian", "Canada"),
        ("Belgian", "Belgium"),
        ("Israeli", "Israel"),
        ("Venezuelan", "Venezuela"),
        ("Polish", "Poland"),
    )

    cells = soup.find_all(
        'div',
        class_=re.compile(
            r'Box-sc-15se88d-0 GridColumns__Cell-sc-1g9p6xx-1\.*'))
    # Name: strip() always returns a string, so no None check is needed.
    name = soup.find('h1').text.strip()

    try:
        # The second grid cell holds a heading read as
        # "<nationality>, <birth year>–...". A page with fewer than two
        # cells raises IndexError, handled like a missing heading.
        heading = cells[1].find('h2').text.strip().split(",")

        # Country
        country = heading[0].strip()
        if country in exact_countries:
            country = exact_countries[country]
        else:
            for needle, mapped in substring_countries:
                if needle in country:
                    country = mapped
                    break
            else:
                # An unknown value containing a digit is not a country.
                if any(str(ch).isnumeric() for ch in country):
                    country = None

        # Born: digits before the en dash in the last heading part.
        try:
            born_text = str(heading[-1]).strip().split("–")[0]
            born = int("".join(ch for ch in born_text if ch.isnumeric()))
        except ValueError:
            born = None
    except (AttributeError, IndexError):
        born = None
        country = None

    # About: the text of the element following the 'Bio' label.
    try:
        about = None
        for block in soup.find_all(
                'div',
                class_=re.compile(r'Box-sc-15se88d-0 Text-sc-18gcpao-0\.*')):
            if block.text.strip() == 'Bio':
                about = block.nextSibling.text.strip()
                break
    except AttributeError:
        about = None

    # pack = [name, born, country, about]
    artist_data_pack = [name, born, country, about]
    # Updating KEY_INFO dictionary.
    KEY_INFO[url] = db.Artist.key_maker(artist_data_pack)
    # Updating the dB with artist listings.
    TheAuthour.write_artist(*artist_data_pack)
def get_artwork_data_slave(self, url):
    """Scrape one artwork page and write the artwork record.

    The artist's page doubles as the seller page here: the same url is used
    for both the seller id and the artist id lookups.

    Args:
        url: Artwork page URL. The medium is taken from "/painting/" or
            "/sculpture/" in the url.

    Raises:
        AttributeError: If the ``product-artist`` link (THE PAGE MUST HAVE A
            SELLER) or one of the mandatory artwork blocks is missing.
        ValueError: If the price text holds no number.
    """
    soup = TheMiner.fetch_page(url)
    if soup is None:
        return

    # Field initiation
    technique = ""  # Material is added to technique
    type_ = None
    dimensions = None
    frame = None
    authenticity = None
    year = None
    support = None
    signature = None

    # Medium must always have "Painting" or "Sculpture" (RULE :: 2)
    if "/painting/" in str(url):
        medium = "Painting"
    elif "/sculpture/" in str(url):
        medium = "Sculpture"
    else:
        # So that url leaks don't break the code.
        medium = None

    # We want the code to break if this entry is not found so that it can
    # be fixed.
    seller_url = str(
        soup.find('div', class_='product-artist').a.get('href')).strip()

    # Seller_id
    seller_id = self.get_seller_id(seller_url)
    # Artist_id
    artist_url = seller_url
    artist_id = self.get_artist_id(artist_url)

    # Continue fetching data only if seller_id, artist_id and medium are
    # found. (RULE :: 3, 4)
    if seller_id is None or artist_id is None or medium is None:
        print(
            f"Skipping : {url}\nSeller_id = {seller_id}, Artist_id = {artist_id}, medium = {medium}"
        )
        return

    heading = soup.h1
    # Artist
    artist = str(heading.find('div', class_='product-artist').a.text).strip()
    # Artwork
    artwork = str(heading.find('div', class_='product-name').text).strip()

    # Price: every digit and dot of the price container.
    price_text = str(
        soup.find('div', class_='product-price').find(
            'div', class_='p-price-container').text).strip()
    price = float(
        "".join(ch for ch in price_text if ch.isdigit() or ch == "."))

    product_details_desc = soup.find('div', class_='product-details_desc')
    for detail in product_details_desc.find_all('div', class_='tech-item'):
        label = str(
            detail.find('div', class_='tech-label').text).strip().upper()
        value = str(detail.find('div', class_='tech-value').text).strip()

        # Both the TECHNIQUE and the MATERIAL values are collected into
        # 'technique'. MATERIAL used to overwrite the accumulated text.
        if label in ('TECHNIQUE', 'MATERIAL'):
            technique += " " + value
        elif label == 'TYPE':
            type_ = value
        elif label == 'DIMENSIONS':
            dimensions = value
        elif label == 'FRAMING':
            frame = value
        elif label == 'QUALITY GUARANTEE':
            authenticity = value
    # str.strip() returns a new string, so the result has to be kept.
    technique = technique.strip()

    try:
        about = str(
            product_details_desc.find(
                'div', class_='desc text-1').text).strip()
    except AttributeError:
        about = None

    image_loc = soup.find('div', class_='product-left').find(
        'div', class_='img-wrapper').img.get('src')

    artwork_bundle = {
        "artwork_title": artwork,
        "artist_name": artist,
        "year": year,
        "price": price,
        "Medium": medium,
        "Type": type_,
        "Dimensions": dimensions,
        "Support": support,
        "Frame": frame,
        "Signature": signature,
        "Authenticity": authenticity,
        "About": about,
        "platform": self.website.platform,
        "image_addr": image_loc,
        "seller_id": seller_id,
        "artist_id": artist_id,
        "url": url,
        "technique": technique
    }
    TheAuthour.write_artwork_price_image(**artwork_bundle)
def get_artwork_data_slave(self, url):
    """Template for a site-specific artwork scraper.

    Each ``TODO`` marks where the site-specific extraction belongs. The
    placeholders are given neutral values so that the template is valid
    Python (the bare ``seller_url =`` / ``artist_url =`` / ``about =``
    assignments were syntax errors).

    Args:
        url: Artwork page URL. The medium is taken from "/painting/" or
            "/sculpture/" in the url.
    """
    soup = TheMiner.fetch_page(url, ghost=True)
    if soup is None:
        return

    # Field initiation
    artwork = None
    type_ = None
    dimensions = None
    frame = None
    authenticity = None
    image_loc = None
    year = None
    support = None
    signature = None
    technique = ""  # Material is added to technique
    artist = None

    # Medium must always have "Painting" or "Sculpture" (RULE :: 2)
    if "/painting/" in str(url):
        medium = "Painting"
    elif "/sculpture/" in str(url):
        medium = "Sculpture"
    else:
        # So that url leaks don't break the code.
        medium = None

    # Seller_id
    # TODO: Pick the seller's url here. The real extraction is meant to
    # crash when the seller url is not found.
    seller_url = None
    seller_id = self.get_seller_id(seller_url)

    # Artist_id
    # TODO: Pick the artist's url here.
    artist_url = None
    artist_id = self.get_artist_id(artist_url)

    # Continue fetching data only if seller_id, artist_id and medium are
    # found. (RULE :: 3, 4)
    if seller_id is None or artist_id is None or medium is None:
        print(f"Skipping : {url}\nSeller_id = {seller_id}, Artist_id = {artist_id}, medium = {medium}")
        return

    try:
        # TODO: Pick the raw price text here.
        price = ""
        temp = ""
        for i in price:
            if i.isdigit():
                temp += i
            if i == ".":
                temp += i
        # An empty placeholder text raises ValueError and yields None.
        price = float(temp)
    except AttributeError:
        price = None
    except ValueError:
        price = None

    # RULE : 5
    if price is None:
        print(f"Skipping {url}\n PRICE : {price}")
        return

    # Find artist, artwork, year, type_, dimensions, support, frame,
    # signature, authenticity, about, image_loc (actual url of the image),
    # and technique. The real extraction should break if either the
    # artist's name or the artwork's name is not found.

    # TODO: Pick the artist's name here.
    print(artist)

    # TODO: Pick the artwork's name here.
    print(artwork)

    try:
        # TODO: Pick the artwork's description here.
        about = None
    except AttributeError:
        about = None

    artwork_bundle = {"artwork_title": artwork, "artist_name": artist, "year": year, "price": price,
                      "Medium": medium, "Type": type_, "Dimensions": dimensions, "Support": support,
                      "Frame": frame, "Signature": signature, "Authenticity": authenticity,
                      "About": about, "platform": self.website.platform, "image_addr": image_loc,
                      "seller_id": seller_id, "artist_id": artist_id, "url": url, "technique": technique}

    TheAuthour.write_artwork_price_image(**artwork_bundle)
def get_artwork_data_slave(self, url, driver):
    """Scrape a single artwork page and write the artwork, price and image.

    The page is loaded through ``driver`` and parsed with BeautifulSoup.
    Data is only written when a seller id, an artist id (RULE 3, 4) and a
    price (RULE 5) could all be resolved; otherwise a "Skipping" message is
    printed and nothing is written.

    Args:
        url: Address of the artwork page.
        driver: Browser driver exposing ``get(url)`` and ``page_source``.

    Returns:
        None. The result is handed to ``TheAuthour.write_artwork_price_image``.

    Raises:
        AttributeError: If the artist name box or the image wrapper is
            missing on a page that passed the seller/artist/price checks.
    """
    driver.get(url)
    # The second argument of BeautifulSoup is the parser name, not the page
    # url; passing the url made the constructor fail on every call.
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    if soup is None:
        print(f"Soup not returned for {url}")
        return

    # Field initiation ::
    artwork = None
    price = None
    type_ = None
    dimensions = None
    frame = None
    authenticity = None
    about = None
    artist_id = None
    image_loc = None
    year = None
    support = None
    signature = None
    # Material to be added to technique
    technique = ""
    seller_id = None
    medium = None

    # Every <p> inside a <pre data-hook="description"> block; used both for
    # the artist's country and for the artwork fields further down.
    paragraphs = [
        para
        for block in soup.find_all('pre', attrs={'data-hook': 'description'})
        for para in block.find_all('p')
    ]

    # Seller_id
    try:
        seller_url = soup.find('div', class_='WncCi').find('a')['href']
        seller_id = self.get_seller_id(seller_url)
    except (AttributeError, TypeError, KeyError):
        # Seller doesn't have a page (no name box, no link, or no href).
        # We'll let the seller name be seller_url if the url is not found.
        try:
            seller_url = soup.find('div', class_='WncCi').text.strip()
            if seller_url not in SELLER_INFO.keys():
                # Make a Kazoart style bundle, and write it to obtain a seller_id.
                # [seller_url, platform_id(from name), Seller's name, Location, website]
                bundle = [
                    seller_url, self.website.platform,
                    'EMERGINGARTISTPLATFOM', None, None
                ]
                # Writing to db. This should generate the seller_id.
                TheAuthour.write_seller(*bundle)
            seller_id = SELLER_INFO.get(seller_url)
        except AttributeError:
            pass

    # Artist_id
    try:
        artist_url = soup.find('div', class_='WncCi').a.get('href')
        if str(artist_url).endswith(".com"):
            # Drop the stray ".com" pieces, then restore the one that belongs
            # to the domain. Literal replacement is used on purpose: as a
            # regex, '.com' also matched things like "lcom" in "welcome".
            artist_url = artist_url.replace('.com', "")
            artist_url = artist_url.replace(
                'emergingartistplatform', 'emergingartistplatform.com')
        artist_id = self.get_artist_id(artist_url)
    except AttributeError:
        # No artist link on the page: register the artist by name instead.
        try:
            artist_url = soup.find('div', class_='WncCi').text.strip()
            country = None
            for para in paragraphs:
                text = para.text
                if 'Country' in text or 'country' in text or 'COUNTRY' in text:
                    country = text.split(":")[-1].strip()
            # artist_data_pack = [name, born, country, about]
            artist_data_pack = [artist_url, None, country, None]
            # Updating KEY_INFO dictionary.
            KEY_INFO[artist_url] = db.Artist.key_maker(artist_data_pack)
            key = KEY_INFO.get(artist_url)
            # Updating the dB with artist listings.
            TheAuthour.write_artist(*artist_data_pack)
            artist_id = ARTIST_INFO[key]
        except (AttributeError, KeyError):
            # A missing name box or an unregistered key means no artist id;
            # the artwork is then skipped below instead of crashing.
            artist_id = None

    # Continue fetching data only if seller_id and artist_id are found. (RULE :: 3, 4)
    if seller_id is None or artist_id is None:
        print(
            f"Skipping : {url}\nSeller_id = {seller_id}, Artist_id = {artist_id}, medium = {medium}"
        )
        return

    # Price: read the first primary-price tag only, so repeated tags cannot
    # glue several prices into one number.
    price_tag = soup.find(
        'span', attrs={'data-hook': 'formatted-primary-price'})
    try:
        digits = "".join(
            ch for ch in price_tag.text if ch.isnumeric() or ch == ".")
        price = float(digits) * rate
    except (AttributeError, ValueError):
        # No price tag, or nothing numeric inside it.
        price = None

    # RULE : 5
    if price is None:
        print(f"Skipping {url}\n PRICE : {price}")
        return

    # Find artist, artwork, year, type_(N/A), dimensions, support, frame,
    # signature, authenticity, about, image_loc (actual url of the image),
    # and technique.
    # Wish the code to break if the Artist's name is not found.
    artist = soup.find('div', class_='WncCi').text.strip()

    # "Label: value" paragraphs of the description.
    for para in paragraphs:
        text = para.text
        if 'Title' in text or 'title' in text or 'TITLE' in text:
            # Titles are capped at 255 characters.
            artwork = text.split(":")[-1].strip()[:255]
        if 'Date' in text:
            year = text.split(":")[-1].strip()
        if 'Size' in text:
            dimensions = text.split(":")[-1].strip()
        if 'Medium' in text:
            technique = text.split(":")[-1].strip()
        if ":" not in text and about is None:
            # First non-empty paragraph without a label is the description.
            # (Previously only its last character was kept, and an empty
            # paragraph raised IndexError.)
            stripped = text.strip()
            if stripped:
                about = stripped

    # Medium (RULE : 3)
    if "Sculptures" in self.website.start_url:
        medium = "Sculpture"
    else:
        medium = "Painting"

    # image_loc -- raises AttributeError if the image wrapper is missing.
    image = soup.find('div', class_='main-media-image-wrapper-hook')
    image = image.find('div', id='get-image-item-id')
    image_loc = image.get('href')

    artwork_bundle = {
        "artwork_title": artwork,
        "artist_name": artist,
        "year": year,
        "price": price,
        "Medium": medium,
        "Type": type_,
        "Dimensions": dimensions,
        "Support": support,
        "Frame": frame,
        "Signature": signature,
        "Authenticity": authenticity,
        "About": about,
        "platform": self.website.platform,
        "image_addr": image_loc,
        "seller_id": seller_id,
        "artist_id": artist_id,
        "url": url,
        "technique": technique
    }
    TheAuthour.write_artwork_price_image(**artwork_bundle)