def MtgGetCard(key, directory):
    key = key if len(key) > 0 else raw_input('Card to search: ')
    # Load the page for the metadata
    if key.isdigit():
        page = urllib2.urlopen(
            'http://gatherer.wizards.com/Pages/Card/Details.aspx?multiverseid=' + str(key))
        soup = BSHTML(page)
    else:
        page = urllib2.urlopen(
            'http://gatherer.wizards.com/Pages/Search/Default.aspx?action=advanced&name=+["'
            + key.replace(' ', '%20') + '"]')
        soup = BSHTML(page)
    multiverse_id = 0
    try:
        multiverse_id = int(page.url[page.url.rfind('=') + 1:])
    except AttributeError:
        return False
    if multiverse_id == 0:
        return False
    # Load the printings page to get legality and set information
    ids_list = []
    page = urllib2.urlopen(
        'http://gatherer.wizards.com/Pages/Card/Printings.aspx?multiverseid=' + str(multiverse_id))
    soup = BSHTML(page)
    try:
        for i in range(0, 100):
            a = soup.find(
                "a", {
                    "id": "ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_PrintingsList_listRepeater_ctl0"
                    + str(i) + "_cardTitle"
                })
            if a is None:
                break
            href = a["href"]
            try:
                muid = int(href[href.rfind("=") + 1:])
            except (AttributeError, ValueError):
                muid = 0
            if muid > 0:
                ids_list.append(muid)
    except AttributeError:
        pass
    for muid in ids_list:
        card = MtgCard()
        card.search(muid, directory)
        print card
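# Usage sketch. These snippets consistently treat BSHTML as an alias for
# bs4.BeautifulSoup (an assumption, not stated in the source); this one is
# Python 2 and also needs urllib2 and an MtgCard class from its module:
#
#   from bs4 import BeautifulSoup as BSHTML
#   import urllib2
#
#   MtgGetCard('Black Lotus', '/tmp/cards')  # search Gatherer by card name
#   MtgGetCard('600', '/tmp/cards')          # or by (illustrative) multiverse id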
def downloadImage(self):
    img_url = ''
    # First try magiccards.info by set code and collector number
    try:
        if len(img_url) == 0 and hasattr(self, 'number') and hasattr(self, 'expansion_code'):
            page = urllib2.urlopen('https://magiccards.info/' + self.expansion_code.lower()
                                   + '/en/' + str(self.number) + '.html')
            soup = BSHTML(page)
            for img in soup.findAll("img"):
                if img['src'].find('scans') > -1:
                    img_url = 'https://magiccards.info/' + img['src']
    except urllib2.HTTPError:
        print '[404] https://magiccards.info/' + self.expansion_code.lower() + '/en/' + str(self.number) + '.html'
    except UnicodeDecodeError:
        print '[Unicode] https://magiccards.info/query?q=' + self.name.replace(' ', '%20').replace('\'', '%27').replace('û', '%FB') + '+e%3A' + self.expansion_code + '%2Fen&v=card&s=cname'
    # Fall back to the Gatherer image handler when a multiverse id is known
    if len(img_url) == 0:
        if hasattr(self, 'multiverseid'):
            img_url = 'http://gatherer.wizards.com/Handlers/Image.ashx?multiverseid=' + str(self.multiverseid) + '&type=card'
        elif not (hasattr(self, 'name') and hasattr(self, 'expansion_code')):
            if hasattr(self, 'number'):
                print 'No MultiverseId: ' + self.expansion_code.lower() + "/" + self.number + " (" + self.path + ")"
            else:
                print 'No Metadata: ' + self.path
            return False
    # Last resort: search magiccards.info by card name
    try:
        if len(img_url) == 0 and hasattr(self, 'name') and hasattr(self, 'expansion_code'):
            page = urllib2.urlopen('https://magiccards.info/query?q=' + self.name.replace(' ', '%20').replace('\'', '%27').replace('û', '%FB') + '+e%3A' + self.expansion_code + '%2Fen&v=card&s=cname')
            soup = BSHTML(page)
            for img in soup.findAll("img"):
                if img['src'].find('scans') > -1:
                    img_url = 'https://magiccards.info/' + img['src']
    except urllib2.HTTPError:
        print '[404] https://magiccards.info/query?q=' + self.name.replace(' ', '%20').replace('\'', '%27').replace('û', '%FB') + '+e%3A' + self.expansion_code + '%2Fen&v=card&s=cname'
    except UnicodeDecodeError:
        print '[Unicode] https://magiccards.info/query?q=' + self.name.replace(' ', '%20').replace('\'', '%27') + '+e%3A' + self.expansion_code + '%2Fen&v=card&s=cname'
    try:
        self.loadFromImage()
        image = urllib.URLopener()
        image.retrieve(img_url, self.path)
        self.image_url = img_url
        self.saveMetadata()
    except Exception:
        if hasattr(self, 'expansion_code'):
            print '404 Page not found: [' + self.expansion_code + '] ' + self.path
        else:
            print '404 Page not found: ' + self.path
        return False
    return True
def createNewsItem():
    print("not app")
    data = requests.get("http://gromdroid.nl/bslim/wp-json/wp/v2/posts/" + request.args.get("id")).json()
    soup = BSHTML(data["content"]["rendered"])
    images = soup.findAll('img')
    img = " "
    for image in images:
        img = image['src']
    apiKey = "MDQ4ZjNmYmMtYTMxMy00MzMzLWI3NWUtNTI0NWQ1MDdlYmZk"
    appId = "88a41eb2-1403-4aa9-8989-c0b430286788"
    header = {
        "Content-Type": "application/json; charset=utf-8",
        "Authorization": "Basic " + apiKey
    }
    payload = {
        "app_id": appId,
        "included_segments": ["All"],
        "contents": {"en": "Nieuws van bslim"},
        "headings": {"en": data["title"]["rendered"]}
    }
    req = requests.post("https://onesignal.com/api/v1/notifications", headers=header, data=json.dumps(payload))
    return jsonify({
        "responseCode": UserApi.createNewsItem(data["title"]["rendered"], data["content"]["rendered"], img)
    })
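# The createNewsItem variants read as Flask view functions (they use request
# and jsonify). A hypothetical registration; the app object and route are
# assumptions, not from the source:
#
#   from flask import Flask, request, jsonify
#   app = Flask(__name__)
#   app.add_url_rule('/news/create', 'createNewsItem', createNewsItem)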
def getImage(response, ID):
    timeout = 0  # If the server refuses connections due to too many requests, increase the timeout.
    htmlText = response
    if htmlText != 'Error':
        soup = BSHTML(htmlText, features="lxml")
        images = soup.findAll('img', {'id': 'screenshot-image'})
        for image in images:
            print('ID: ' + ID + ' source: ' + image['src'])  # Show output with ID and image source.
            urllib.URLopener.version = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11'
            if 'https' in image['src']:
                urllib.urlretrieve(image['src'], ID + '.png')
            else:
                urllib.urlretrieve('http:' + image['src'], ID + '.png')
        time.sleep(timeout)
    else:
        time.sleep(timeout)
def createNewsItem():
    data = requests.get("http://gromdroid.nl/bslim/wp-json/wp/v2/posts/" + request.args.get("id")).json()
    soup = BSHTML(data["content"]["rendered"])
    images = soup.findAll('video')
    img = " "
    for image in images:
        img = image['src']
    apiKey = "YTFkZGY1OGUtNGM5NC00ODdmLWJmN2QtNjMxYzNjMzk0MWJl"
    appId = "893db161-0c60-438b-af84-8520b89c6d93"
    header = {
        "Content-Type": "application/json; charset=utf-8",
        "Authorization": "Basic " + apiKey
    }
    payload = {
        "app_id": appId,
        "included_segments": ["All"],
        "contents": {"en": "Nieuws van bslim"},
        "headings": {"en": data["title"]["rendered"]}
    }
    # req = requests.post("https://onesignal.com/api/v1/notifications", headers=header, data=json.dumps(payload))
    return jsonify({
        "responseCode": UserApi.createNewsItem(data["title"]["rendered"], data["content"]["rendered"], img)
    })
def BestBuy():
    laptops = []
    driver.get("https://www.bestbuy.com.mx/c/laptops/c41")
    pag = 0
    while pag < 3:  # number of pages with content
        # Re-read the page source each iteration so the newly loaded page is parsed
        content = driver.page_source
        soup = BSHTML(content, features="html.parser")
        for a in soup.findAll('div', attrs={'class': 'product-line-item-line'}):
            name = a.find('div', attrs={'class': 'product-title'})
            laptops.append(name.text)
        driver.find_element_by_xpath(
            '//*[@id="plp-container"]/div/div[2]/div[2]/div[2]/div/div[4]/div[2]/ul/li[7]/a'
        ).click()
        print(len(laptops))
        sleep(4)
        pag += 1
    # Split each title into brand and model on the first "-"
    brands = [lap.split("-")[0].strip() for lap in laptops]
    models = [lap.split("-")[-1].strip() for lap in laptops]
    cels = BestBuyCels()
    teles = BestBuyTvs()
    df = pd.DataFrame({'Laptop Brand': brands, 'Laptop Model': models})
    df2 = pd.DataFrame({'Cellphone Name': cels})
    df3 = pd.DataFrame({'TV Name': teles})
    df.to_csv('laptops.csv', index=False, encoding='utf-8')
    df2.to_csv('cels.csv', index=False, encoding='utf-8')
    df3.to_csv('tvs.csv', index=False, encoding='utf-8')
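# The Best Buy and Amazon scrapers share a module-level Selenium driver.
# A hypothetical setup covering the globals they rely on (the browser choice
# is an assumption; find_element_by_xpath implies Selenium older than 4.3):
#
#   from selenium import webdriver
#   from time import sleep
#   from bs4 import BeautifulSoup as BSHTML
#   import pandas as pd
#
#   driver = webdriver.Chrome()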
def getImageURL(siteURL):
    req = Request(siteURL, headers={'User-Agent': 'Mozilla/5.0'})
    try:
        page = urlopen(req).read()
        soup = BSHTML(page, 'html.parser')
        images = soup.find_all('meta', {"name": "og:image"})
        if (len(images) > 0 and images[0]['content'] !=
                "https://geniuskitchen.sndimg.com/fdc-new/img/fdc-shareGraphic.png"):
            return images[0]['content']
        else:
            return "../static/images/recipe-placeholder-image.svg"
    except Exception:
        print("Failed to fetch image.")
        return "../static/images/recipe-placeholder-image.svg"
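# A hypothetical call, assuming the imports the snippet relies on
# (urllib.request and bs4.BeautifulSoup aliased as BSHTML):
#
#   from urllib.request import Request, urlopen
#   from bs4 import BeautifulSoup as BSHTML
#
#   url = getImageURL('https://www.food.com/recipe/...')
#   # returns the og:image content, or the placeholder path on failure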
def AmazonLaptops():
    link1 = 'https://www.amazon.com.mx/gp/bestsellers/electronics/10189669011?ref_=Oct_s9_apbd_obs_hd_bw_bB7aoOB_S&pf_rd_r=5794MD68EN89CRQZWMDQ&pf_rd_p=58d7811c-7134-5551-b955-42726ceffed4&pf_rd_s=merchandised-search-10&pf_rd_t=BROWSE&pf_rd_i=10189669011'
    link2 = 'https://www.amazon.com.mx/gp/bestsellers/electronics/10189669011/ref=zg_bs_pg_2?ie=UTF8&pg=2'
    laptops = []
    driver.get(link1)
    sleep(2)
    content = driver.page_source
    soup = BSHTML(content, features="html.parser")
    for a in soup.findAll(
            'div', attrs={'class': 'p13n-sc-truncate-desktop-type2 p13n-sc-truncated'}):
        laptops.append(a.text)
    driver.get(link2)
    sleep(2)
    # Re-read the page source after navigating, so page 2 is actually parsed
    content = driver.page_source
    soup = BSHTML(content, features="html.parser")
    for a in soup.findAll(
            'div', attrs={'class': 'p13n-sc-truncate-desktop-type2 p13n-sc-truncated'}):
        laptops.append(a.text)
    return laptops
def get_images(url):
    rating = url.split("/")[-3].split("-")[0]
    for year in range(2004, 2019):  # 2004-2018 (this is the max range as of 2020)
        question = True
        try:
            r = requests.get(f"{url}{year}")
        except requests.RequestException:
            continue
        soup = BSHTML(r.content, features="lxml")
        images = soup.findAll("img")
        for image in images:
            if "individual-problems" in image.decode():
                # Question and answer images alternate on the page
                if question:
                    data[rating[0].upper() + "MC"][str(year)]["questions"].append(
                        base_url + str(image['src']))
                    question = False
                else:
                    data[rating[0].upper() + "MC"][str(year)]["answers"].append(
                        base_url + str(image['src']))
                    question = True
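# get_images mutates module-level state. A hypothetical setup matching the
# keys it uses (data, base_url, and the "AMC" rating key are all inferred
# from the code, not confirmed by the source):
#
#   base_url = "https://example.org"  # prefix for relative img srcs
#   data = {"AMC": {str(y): {"questions": [], "answers": []}
#                   for y in range(2004, 2019)}}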
def BestBuyTvs():
    tvs = []
    driver.get("https://www.bestbuy.com.mx/c/pantallas/c35")
    pag = 0
    while pag < 4:  # number of pages with content
        # Re-read the page source each iteration so the newly loaded page is parsed
        content = driver.page_source
        soup = BSHTML(content, features="html.parser")
        for a in soup.findAll('div', attrs={'class': 'product-line-item-line'}):
            name = a.find('div', attrs={'class': 'product-title'})
            tvs.append(name.text)
        driver.find_element_by_xpath(
            '//*[@id="plp-container"]/div/div[2]/div[2]/div[2]/div/div[4]/div[2]/ul/li[10]/a'
        ).click()
        print(len(tvs))
        sleep(6)
        pag += 1
    print(len(tvs))
    return tvs
async def corona(ctx, state="delhi"):
    url = "https://www.mohfw.gov.in/"
    page = requests.get(url).text
    soup = BSHTML(page, features="html.parser")
    table_data = [[cell.text for cell in row("td")] for row in soup("tr")]
    table_data = table_data[1:]
    for row in table_data:
        try:
            if row[1].lower() == state.lower():
                embed = discord.Embed(
                    title="Corona Update for " + row[1],
                    description="",
                    color=0x00FF00,
                    url=url,
                )
                embed.add_field(name="Active Cases", value=row[2], inline=False)
                embed.add_field(name="Cured", value=row[3], inline=False)
                embed.add_field(name="Dead", value=row[4], inline=False)
                embed.add_field(name="Total Cases", value=row[5], inline=False)
                embed.set_footer(text="Data retrieved from " + url)
                await ctx.send(embed=embed)
                break
        except:
            await ctx.send("Incorrect argument.")
            break
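# A hypothetical registration for the coroutine above, assuming discord.py's
# commands extension (the bot object and prefix are illustrative):
#
#   from discord.ext import commands
#   bot = commands.Bot(command_prefix="!")
#   corona = bot.command(name="corona")(corona)   # invoked as: !corona maharashtra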
def extract_images(message):
    soup = BSHTML(message, 'html.parser')
    imgs = soup.findAll('img')
    if len(imgs) > 0:
        for img in imgs:
            images.append(img['src'])
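# extract_images appends to a module-level list. A minimal usage sketch
# (the sample HTML is illustrative):
#
#   images = []
#   extract_images('<p>hi <img src="/a.png"></p>')
#   # images == ['/a.png']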
def update_wardrobe(self, file_id, permalink_public, file_description):
    res = requests.post(
        "https://slack.com/api/files.sharedPublicURL", {
            "token": "xoxp-434681964305-435150868323-479950531236-4283e3391cc6463655a5e91d375d1db3",
            "file": file_id
        })
    if res.ok:  # HTTP status < 400
        page = urllib2.urlopen(permalink_public)
        soup = BSHTML(page)
        images = soup.findAll('img')
        discovery_json = self.recognize_image(images[0]['src'], file_description)
        environ = os.environ
        path = os.getcwd() + "\\data\\wardrobe\\"
        with open(os.path.join(path, file_id + '.json'), 'w') as json_file:
            json.dump(discovery_json, json_file)
        with open(os.path.join(path, file_id + '.json'), 'r') as f:
            data = f.read()
        self.wardrobe_discovery_client.add_document(
            environ.get('WARDROBE_DISCOVERY_ENVIRONMENT_ID'),
            environ.get('WARDROBE_DISCOVERY_COLLECTION_ID'),
            file=data,
            filename=file_id + '.json')
    else:
        raise Exception("Unable to upload picture. Please try again")
def cleanOriginalDocs(dir):
    '''
    Given a directory containing the set of original documents from the
    DUC 2003 conference, renames and parses them into the format expected
    by our system.
    '''
    # Rename directories
    _, dirs, _ = os.walk(dir).next()
    for subdir in dirs:
        ID = subdir[1:-1]
        newDir = os.path.join(dir, ID)
        oldDir = os.path.join(dir, subdir)
        os.rename(oldDir, newDir)
        for name in os.listdir(newDir):
            # Rename the documents themselves
            tmp = name.split('.')
            fileID = tmp[0][3:] + tmp[1] + '.txt'
            newFile = os.path.join(newDir, fileID)
            oldFile = os.path.join(newDir, name)
            os.rename(oldFile, newFile)
            # Extract the text!
            with open(newFile, 'r') as txt:
                HTML = BSHTML(txt.read(), 'xml')
                text = HTML.TEXT.text.replace('\n', '')
                sentences = tokenizer.tokenize(text)
            with open(os.path.join(newDir, 'Parsed.' + fileID), 'w') as f:
                for s in sentences:
                    f.write("{}\n".format(s))
def get_iframe_tags(self, data):
    soup = BSHTML(data)
    iframes = soup.findAll('iframe')
    if iframes:
        iframe = iframes[0]
        src = iframe['src']
        return src
    return ''
def get_img_tags(self, data):
    soup = BSHTML(data)
    images = soup.findAll('img')
    if images:
        image = images[0]
        src = image['src']
        return src
    return ''
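# These two helpers return the src of the first <iframe> or <img>, or ''.
# A hypothetical call on an instance `obj` of the enclosing class:
#
#   obj.get_iframe_tags('<iframe src="https://player.example/v/1"></iframe>')
#   # -> 'https://player.example/v/1'
#   obj.get_img_tags('<p>no images here</p>')   # -> ''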
def get_img_tags(url):
    # Get all image sources from the page
    print "Getting all src from img tags of " + url
    page = urllib2.urlopen(url)
    soup = BSHTML(page, "html.parser")  # BSHTML(page) works too; the parser argument just makes the choice explicit
    images = soup.findAll('img')
    return len(images), images
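# Usage sketch (Python 2; assumes `import urllib2` and
# `from bs4 import BeautifulSoup as BSHTML`):
#
#   count, images = get_img_tags('http://example.com')
#   for img in images:
#       print img.get('src')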
def post_post_save_receiver(sender, instance, created, *arg, **kwargs):
    if created:
        soup = BSHTML(instance.content, "html.parser")
        images = soup.findAll('img')
        for image in images:
            if 'http' not in image['src']:
                picture = Picture(post_id=instance,
                                  image=image['src'].replace('/media/', ''))
                picture.save()
def getNoticeTitle(renglones):
    a = 0
    try:
        while "<h3>" not in renglones[a]:
            a += 1
        encabezado = renglones[a] + " " + renglones[a + 2]
        encabezadoBeauti = BSHTML(encabezado, 'lxml')
        return encabezadoBeauti.text
    except Exception as ex:
        print(ex)
        return "Error"
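# getNoticeTitle scans a list of HTML lines (renglones) for the first <h3>
# and joins it with the line two positions later. A hypothetical call:
#
#   lines = ['<div>', '<h3>Storm warning', '', 'issued today</h3>', '</div>']
#   getNoticeTitle(lines)   # -> 'Storm warning issued today'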
def post_post_save_receiver(sender, instance, created, *arg, **kwargs):
    if created:
        soup = BSHTML(instance.content, "html.parser")
        images = soup.findAll('img')
        for image in images:
            img_path = image['src']
            if 'https://green-edu-link-v2.s3.amazonaws.com' in img_path:
                img_path = img_path.replace(
                    'https://green-edu-link-v2.s3.amazonaws.com/media/', '')
                picture = Picture(post_id=instance, image=img_path)
                picture.save()
def storeQuestionText(ques, curr_path, programming_Lang):
    ques_Text_File = curr_path + "Question_Text/" + str(
        ques['question_id']) + "_QT" + "_" + programming_Lang
    BS = BSHTML(ques['body'])
    ques_Title = str(ques['title'].encode('utf8')) + "\n" + "----------" + "\n"
    with open(ques_Text_File, "w") as f:
        f.write(ques_Title)
        for segment in BS.find_all('p'):
            # Store the paragraph text in the file
            codeText = str(segment.get_text().encode('utf8'))
            f.write(codeText)
def stripFilms(filmRawData):
    """
    Returns a films list containing stripped-down output from a letterboxd
    json file.

    Keyword Arguments:
    filmRawData: Raw JSON letterboxd data.
    """
    films = []
    open('./error.json', 'w').close()  # truncate the error log
    for eachFilm in filmRawData["entries"]:
        try:
            rawHTML = BSHTML(eachFilm.get("summary"), 'html.parser')
            review = []
            for eachPara in rawHTML.find_all('p')[1:]:
                for eachEle in eachPara.contents:
                    review.append(str(eachEle))
                review.append('<p/>')
            imgsrc = rawHTML.find('img')['src']
            films.append({
                "title": eachFilm.get("letterboxd_filmtitle"),
                "year": eachFilm.get("letterboxd_filmyear"),
                "link": eachFilm.get("link"),
                "watcheddate": eachFilm.get("letterboxd_watcheddate"),
                "rewatch": eachFilm.get("letterboxd_rewatch"),
                "rating": eachFilm.get("letterboxd_memberrating"),
                "summary": review,
                "imgsrc": imgsrc,
            })
        except Exception:
            with open('./error.json', 'a') as f:
                e = sys.exc_info()
                f.write(str(e) + json.dumps(eachFilm) + '\n')
    return films
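# The entry keys (entries, summary, letterboxd_filmtitle, ...) look like what
# feedparser produces for a Letterboxd RSS feed; that is an inference, not
# confirmed by the source. A hypothetical pipeline under that assumption:
#
#   import feedparser
#   feed = feedparser.parse('https://letterboxd.com/someuser/rss/')
#   films = stripFilms(feed)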
def storeAnswerText(ques, curr_path, programming_Lang):
    answer_ID = 1
    for ans in ques['answers']:
        BS = BSHTML(ans['body'])
        if BS.find_all('p'):
            ans_Text_File = curr_path + "Answer_Text/" + str(
                ques['question_id']) + "_AT" + str(
                    answer_ID) + "_" + programming_Lang
            answer_ID = answer_ID + 1
            with open(ans_Text_File, "w") as f:
                for segment in BS.find_all('p'):
                    codeText = str(segment.get_text().encode('utf8'))
                    f.write(codeText)
def downloadPrice(self):
    price = 'R$ 0,00'
    try:
        page = urllib2.urlopen('https://www.ligamagic.com.br/?view=cards%2Fsearch&card=' + self.name.replace(' ', '+'))
        soup = BSHTML(page)
        for div in soup.findAll("div", {'id': 'precos-medio'}):
            price = div.text
    except urllib2.HTTPError:
        print '[404] https://www.ligamagic.com.br/?view=cards%2Fsearch&card=' + self.name.replace(' ', '+')
    self.loadFromImage()
    self.price = price
    self.saveMetadata()
    return True
def Amazon():
    link1 = 'https://www.amazon.com.mx/gp/bestsellers/electronics/9687458011?ref_=Oct_s9_apbd_obs_hd_bw_bAZbaMl_S&pf_rd_r=HZ45SGH8T7GKZ74AS33S&pf_rd_p=4d9d93c0-fea5-5ed3-9cdc-da3baf21c408&pf_rd_s=merchandised-search-10&pf_rd_t=BROWSE&pf_rd_i=9687458011'
    link2 = 'https://www.amazon.com.mx/gp/bestsellers/electronics/9687458011/ref=zg_bs_pg_2/132-1166954-1513655?ie=UTF8&pg=2'
    celulares = []
    driver.get(link1)
    sleep(2)
    content = driver.page_source
    soup = BSHTML(content, features="html.parser")
    for a in soup.findAll('div', attrs={'class': 'p13n-sc-truncated'}):
        celulares.append(a.text)
    driver.get(link2)
    sleep(2)
    content = driver.page_source
    soup = BSHTML(content, features="html.parser")
    for a in soup.findAll('div', attrs={'class': 'p13n-sc-truncated'}):
        celulares.append(a.text)
    laps = AmazonLaptops()
    # tvs = Amazontvs()
    print(len(celulares), 'cels')
    print(len(laps), 'laps')
    # print(len(tvs), 'tvs')
    if len(celulares) > 0 and len(laps) > 0:
        df = pd.DataFrame({'Cellphone Name': celulares})
        df.to_csv('celsAmazon.csv', index=False, encoding='utf-8')
        df2 = pd.DataFrame({'Laptop Name': laps})
        df2.to_csv('lapsAmazon.csv', index=False, encoding='utf-8')
def GetAmazonImgs(link):
    driver.get(link)
    content = driver.page_source
    soup = BSHTML(content, features="html.parser")
    images = soup.findAll('img')
    imgsrc = []
    imgalt = []
    for image in images:
        print(image['src'])
        imgsrc.append(image['src'])
        try:
            imgalt.append(image['alt'])
        except KeyError:
            imgalt.append('imagealt')  # placeholder for images without alt text
    return imgsrc, imgalt
def storeAnswerCode(ques, curr_path, programming_Lang):
    answer_ID = 1
    for ans in ques['answers']:
        BS = BSHTML(ans['body'])
        if BS.find_all('pre'):
            ans_segment = 1
            for segment in BS.find_all('pre'):
                ans_Code_File = curr_path + "Answer_Code/" + str(
                    ques['question_id']) + "_AC" + str(answer_ID) + "_" + str(
                        ans_segment) + "." + programming_Lang
                ans_segment = ans_segment + 1
                codeText = str(segment.get_text().encode('utf8'))
                with open(ans_Code_File, "w") as f:
                    f.write(codeText)
            # Increment after writing so numbering starts at 1, matching storeAnswerText
            answer_ID = answer_ID + 1
def __call__(self, request):
    # Code to be executed for each request before
    # the view (and later middleware) are called.
    response = self.get_response(request)
    try:
        contenttype = response['Content-Type']
    except Exception:
        contenttype = ""
    if contenttype == "text/html; charset=utf-8":
        response_text = str(response.content, encoding='UTF-8')
        soup = BSHTML(response_text, features="lxml")
        images = soup.findAll('img')
        for image in images:
            if image.has_attr('alt'):
                alt = image['alt']
            else:
                # Derive an alt text from the image file name
                alt = image['src'].replace("/media/img/", "").replace(
                    ".jpg", "").replace("_", " ").replace("-", " ")
            if not image.has_attr('title'):
                image['title'] = alt
        response_text = str(soup)
        response.content = bytes(response_text, encoding='UTF-8')
    for i in self.agents:
        if i in request.META['HTTP_USER_AGENT']:
            print(request.META['HTTP_USER_AGENT'])
            print(i)
            response.content = bytes("", encoding='UTF-8')
    # Code to be executed for each request/response after
    # the view is called.
    return response
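# A hypothetical registration for this middleware (Django settings.py; the
# class name, dotted path, and __init__ are assumptions following the
# standard Django middleware protocol):
#
#   # settings.py
#   MIDDLEWARE = [
#       # ...,
#       'myapp.middleware.ImageTitleMiddleware',
#   ]
#
#   class ImageTitleMiddleware:
#       def __init__(self, get_response):
#           self.get_response = get_response
#           self.agents = []  # user-agent substrings whose responses are blanked (assumed)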
def storeQuestionCode(ques, curr_path, programming_Lang):
    # Extract the code part from the user's question body
    BS = BSHTML(ques['body'])
    ques_segment = 1
    EntireCodeText = " "
    for segment in BS.find_all('pre'):
        ques_Code_File = curr_path + "Question_Code/" + str(
            ques['question_id']) + "_QC_" + str(
                ques_segment) + "." + programming_Lang
        ques_segment = ques_segment + 1
        codeText = str(segment.get_text().encode('utf8'))
        EntireCodeText = EntireCodeText + codeText + "\n"
        # Store the code text in the file
        with open(ques_Code_File, "w") as f:
            f.write(codeText)
    parseQuestionCode(ques, curr_path, programming_Lang, EntireCodeText)
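# The ques dict these store* helpers consume looks like a Stack Exchange API
# question object (question_id, title, body, answers); a hypothetical shape
# for testing (Python 2, matching the snippets above):
#
#   ques = {
#       'question_id': 42,
#       'title': u'How do I parse HTML?',
#       'body': u'<p>Intro.</p><pre>soup = BSHTML(html)</pre>',
#       'answers': [{'body': u'<p>Use a parser.</p>'}],
#   }
#   storeQuestionText(ques, './out/', 'python')
#   storeQuestionCode(ques, './out/', 'python')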
def download_one(i, row, save_folder):
    try:
        page = urllib.request.urlopen(row["url"])
        soup = BSHTML(page)
        images = soup.findAll("img")
    except Exception:
        print("broken link")
        return
    for image in images:
        print(image["src"])
        try:
            row["labels"] = row["labels"].strip("'")[1:].replace(" ", "_")
            print(row)
            with open(save_folder + "{}_{}.png".format(row["labels"].strip("'"), i), "wb") as f:
                f.write(requests.get(image["src"]).content)
        except Exception:
            return
    return
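# A hypothetical call, assuming a CSV-style row dict (the "url" and "labels"
# keys come from the code above; the values are illustrative):
#
#   row = {"url": "http://example.com/page", "labels": "'[cat dog'"}
#   download_one(0, row, "./downloads/")
#   # saves each <img> on the page as ./downloads/cat_dog_0.png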