def get_seller_data(self, url):
    # Caller :: get_artwork_data_slave and get_seller_id
    #
    # NOTE(review): this block looks like an incomplete/stripped copy of the
    # full get_seller_data implementation elsewhere in this file. The
    # variables seller_name, location and website are printed and bundled
    # but NEVER assigned anywhere in this body, so any call that gets past
    # fetch_page raises NameError. The soup.find(...) extraction code has
    # presumably been lost — restore it (see the sibling implementation)
    # before using this function.
    visited.discard(url)  # allow this URL to be fetched again
    soup = TheMiner.fetch_page(url)
    if soup is not None:
        # Seller's Name
        # NOTE(review): seller_name is undefined here -> NameError.
        print(seller_name)  # Code will break if seller's name is not found
        # Location
        try:
            # NOTE(review): location is undefined here -> NameError
            # (NameError is not caught by the AttributeError handler below).
            print(location)
        except AttributeError:
            location = None
        # Website
        try:
            # NOTE(review): website is undefined here -> NameError.
            print(website)
        except AttributeError:
            website = None
        except TypeError:
            website = None
        bundle = [url, self.website.platform, seller_name, location, website]
        print(bundle)
        TheAuthour.write_seller(*bundle)
def recurr(url):
    """Recursively crawl artist-listing pages, collecting artist links.

    Fetches *url*, appends every artist profile link on the page to
    self.artist_listings, queues unseen pagination links into self.listy,
    then fans out over self.listy with a thread pool.

    NOTE: closure — relies on `self` and `visited` from the enclosing scope.
    """
    soup = TheMiner.fetch_page(url, ghost=True)
    if soup is not None:
        # Because singulart keeps blocking ips, we'll ship everything inside
        # try-except statements.
        try:
            # artist_blocks = soup.find_all('div', class_='artist-container')
            artist_blocks = soup.find_all('figure', class_='pic-artist')
            print(len(artist_blocks))
            for artist in artist_blocks:
                link = artist.figcaption.h2.a.get('href')
                if self.website.domain not in link:
                    # BUG FIX: the original passed the builtin `list` to
                    # link_maker instead of the relative link just extracted.
                    link = self.link_maker(link)
                self.artist_listings.append(link)
            # next pages
            next_pages = soup.find('div', class_='pagerfanta').find('nav')
            next_pages = next_pages.find_all('a')
            for next_ in next_pages:
                link = next_.get('href')
                if self.website.domain not in link:
                    link = self.link_maker(link)
                if link not in self.listy:
                    self.listy.append(link)
            with concurrent.futures.ThreadPoolExecutor() as executor:
                trig = executor.map(recurr, self.listy)
                for trigger in trig:
                    # Drain the iterator so worker exceptions surface here.
                    pass
        except AttributeError:
            # Expected layout missing (most likely a blocked request) —
            # forget the visit so the page can be retried.
            visited.discard(url)
def get_seller_data(self, url):
    """Scrape one seller page and persist its data via TheAuthour.

    Caller :: get_artwork_data_slave

    Writes [url, platform, seller_name, location, website]; location and
    website fall back to None when absent from the page.
    """
    visited.discard(url)
    soup = TheMiner.fetch_page(url)
    if soup is None:
        return
    header = soup.find('div', id='top-seller')
    # Deliberately unguarded: a missing header/h1 means the page layout
    # changed, and the crash should surface loudly.
    seller_name = header.h1.text.strip()
    try:
        subtitle = header.find('p', class_="subtitle").text.strip()
        location = subtitle.split(',')[-1].strip()
    except AttributeError:
        location = None
    try:
        website = str(soup.find('ul', id="websites").a['href']).strip()
    except (AttributeError, TypeError):
        website = None
    bundle = [url, self.website.platform, seller_name, location, website]
    TheAuthour.write_seller(*bundle)
def key_maker(artist_url):
    """Fetch an artist page and build a db.Artist key from its data.

    Returns the key produced by db.Artist.key_maker([name, None, country,
    about]), or None when the page is not returned or has no usable name.
    """
    visited.discard(artist_url)
    soup = TheMiner.fetch_page(artist_url)
    if soup is not None:
        artist_resume = soup.find('div', class_='artist-resume').find('div', class_='artist-resume_text')
        # If an error occurs here, it's because the page layout has changed
        # and thus the code needs to be fixed.
        name = artist_resume.h1.text.strip()
        print(name)
        if name is not None:
            try:
                country = artist_resume.find('p', class_='location').text.strip().split('\n')
                country = country[0].split(',')
                country = country[-1].strip()
                print(country)
            except AttributeError:
                country = None
            # About will either be found and be some text or be None.
            # BUG FIX: the original dereferenced .text unguarded, so a
            # missing 'about' div raised AttributeError instead of yielding
            # the documented None.
            try:
                about = soup.find('div', id='about').text.strip()
            except AttributeError:
                about = None
            # pack = [name, born, country, about]
            artist_data_pack = [name, None, country, about]
            key = db.Artist.key_maker(artist_data_pack)
            return key
        else:
            return None
def miner(self):
    """Run the full scraping pipeline for this module.

    Miner's track: land on the artwork-listings pages and pick the listings
    from there; pick sellers and artists from the artwork pages; from the
    artwork pages fetch the artworks for sale by the listed artists; finish
    by downloading images.
    """
    # Stage 1: artist listings.
    self.get_artist_listings()
    # Un-mark the first-pass product pages so they can be visited again.
    for page_url in self.first_prod_list:
        visited.discard(page_url)
    # Stage 2: artwork listings. Chain:
    # get_artwork_listings_master -> get_artwork_listings_slave
    #   -> get_artist_data -> write_artist_data
    # so artist data is complete after this call.
    self.get_artwork_listings_master()
    # Stage 3: artwork data.
    self.get_artwork_data_master()
    # Data collection completed for this module — download images now.
    TheMiner.sir_image_manager()
def key_maker(artist_url):
    """Render an artist page with headless Firefox and build a db.Artist key.

    Caches the key in KEY_INFO, writes the artist through TheAuthour, and
    returns the key — or None when no artist name can be found.
    """
    options = Options()
    options.headless = True
    driver = webdriver.Firefox(options=options)
    visited.discard(artist_url)
    try:
        # BUG FIX: the original never navigated to artist_url, so
        # driver.page_source was the blank startup page.
        driver.get(artist_url)
        # BUG FIX: the original passed artist_url as the BeautifulSoup
        # parser argument; the second argument must be a parser name.
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        if soup is None:
            return None
        n_c = soup.find_all('h2', class_='font_2')
        # Artist's name
        try:
            name = n_c[0].text.strip()
        except IndexError:
            print(n_c)
            name = None
        # If an error occurs here, it's because the page layout has changed
        # and thus the code needs to be fixed.
        if name is None:
            return None
        # Country
        # BUG FIX: n_c[1] can raise IndexError when only one heading exists;
        # the original only caught AttributeError.
        try:
            country = n_c[1].text.strip()
        except (AttributeError, IndexError):
            country = None
        # About: concatenate every bio paragraph (trailing space per chunk,
        # matching the original accumulation), or None on failure.
        try:
            about = ""
            for t in soup.find_all('p', class_='font_8'):
                about += t.text.strip()
                about += " "
        except (AttributeError, TypeError):
            about = None
        # pack = [name, born, country, about]
        artist_data_pack = [name, None, country, about]
        # Updating KEY_INFO dictionary so later lookups can reuse the key.
        KEY_INFO[artist_url] = db.Artist.key_maker(artist_data_pack)
        key = KEY_INFO.get(artist_url)
        # Updating the dB with artist listings.
        TheAuthour.write_artist(*artist_data_pack)
        return key
    finally:
        # BUG FIX: the original leaked the driver on the soup-is-None path;
        # quit unconditionally.
        driver.quit()
def artist_id_slave(self, artist_url):
    """Resolve an artist page URL to its database artist_id.

    Records the artist's data first, then maps URL -> key (KEY_INFO) and
    key -> artist_id (ARTIST_INFO). Returns None on any miss.
    """
    visited.discard(artist_url)
    soup = TheMiner.fetch_page(artist_url)
    if soup is None:
        print("ARTIST_ID_SLAVE : Soup not returned")
        return None
    self.get_artist_data(soup, artist_url)
    # Getting the key from KEY_INFO.
    if artist_url not in KEY_INFO.keys():
        print("ARTIST_ID_SLAVE : Could not find artist_id")
        return None
    key = KEY_INFO.get(artist_url)
    # Getting artist_id using the key from ARTIST_INFO.
    if key not in ARTIST_INFO.keys():
        print("ARTIST_ID_SLAVE : Artist id not in ARTIST_INFO")
        return None
    return ARTIST_INFO.get(key)
def get_artwork_listings_slave(self, url):
    """Collect unsold-artwork links from one artist page, then record the
    artist's data. Blocked pages are queued on self.listy for a recall."""
    soup = TheMiner.fetch_page(url, ghost=True)
    if soup is None:
        return
    # Artist's info and artwork listings are available on the same page.
    try:
        # The name lookup doubles as the "page actually returned" probe:
        # it raises AttributeError when the site blocked the request.
        name = soup.find('div', class_='artist-intro').find('div', class_='content').h1.text
        containers = soup.find_all('div', class_='artist-container artist-container--details')
        print(f"BLOCK : {len(containers)}")
        try:
            for container in containers:
                cards = container.find_all('figure', class_='artwork-item artwork-item--details')
                print(f"ITEMS : {len(cards)}")
                for card in cards:
                    meta_text = card.find('div', class_='meta').text.strip()
                    if "Sold" in str(meta_text):
                        continue
                    href = card.find('a')['href']
                    if self.website.domain not in href:
                        href = self.link_maker(href)
                    if href not in self.artwork_listings:
                        self.artwork_listings.append(href)
        except AttributeError:
            pass
        self.get_artist_data(soup, url)
    except AttributeError:
        print("B")
        # Urls that get blocked are discarded from visited and added to
        # listy for a recall (linear if listy is small, multithreaded if it
        # grows large, until it's brought down in size).
        visited.discard(url)
        self.listy.append(url)
def get_artwork_data_slave(url):
    """Scrape one artwork page for the artist link and id.

    NOTE: closure — relies on `self` and `visited` from the enclosing scope.
    On a blocked / unreturned page the url is forgotten from `visited` and
    queued on self.listy for a retry.
    """
    soup = TheMiner.fetch_page(url, ghost=True)
    if soup is not None:
        # Initiation
        try:
            # Artist_url: second column of the artwork-focus header.
            artist_url = soup.find('div', class_='artwork-focus').find_all('div', class_='col-md-12 col-lg-6')
            try:
                artist_url = artist_url[1].find('h2').a['href']
                # BUG FIX: the original used mangled names
                # self_website_domain / self_link_maker, raising NameError
                # on every call that reached this branch.
                if self.website.domain not in artist_url:
                    artist_url = self.link_maker(artist_url)
            except AttributeError:
                artist_url = None
            # Artist_id
            artist_id = self.artist_id
        except AttributeError:
            # Comes here if the page is not returned by the website.
            visited.discard(url)
            # BUG FIX: self_listy -> self.listy (NameError otherwise).
            self.listy.append(url)
def get_seller_data(self, url):
    """Scrape one seller page and persist its data via TheAuthour.

    Caller :: get_artwork_data_slave and get_seller_id — we only get here
    after the seller was NOT found in SELLER_INFO.
    """
    visited.discard(url)
    soup = TheMiner.fetch_page(url)
    if soup is None:
        return
    # Seller's Name — nothing is written without one.
    seller_name = None
    try:
        seller_box = soup.find('div', id='jumpto--PartnerHeader')
        seller_name = seller_box.h1.text.strip()
    except AttributeError:
        pass
    if seller_name is None:
        return
    # Location: no dedicated field on this page — best effort from the
    # header's next sibling(s).
    try:
        location = ""
        siblings = seller_box.h1.nextSibling()
        try:
            location = siblings.text
        except AttributeError:
            for sib in siblings:
                location += sib.text
                location += " "
    except (AttributeError, TypeError):
        location = None
    # Website: first anchor whose href contains "http".
    # BUG FIX: the original rebound `website` from the find_all() result
    # set only inside the loop, so when no "http" link existed the whole
    # tag list was written to the DB; default to None instead.
    website = None
    try:
        for anchor in soup.find_all('a'):
            href = anchor.get('href')
            if "http" in str(href):
                website = href
                print(href)
                break
    except (AttributeError, IndexError):
        website = None
    bundle = [url, self.website.platform, seller_name, location, website]
    TheAuthour.write_seller(*bundle)