def MtgGetCard(key, directory):
    key = key if len(key) > 0 else raw_input('Card to search: ')
    # Load the page for the metadata
    if key.isdigit():
        page = urllib2.urlopen(
            'http://gatherer.wizards.com/Pages/Card/Details.aspx?multiverseid=' + str(key))
        soup = BSHTML(page)
    else:
        page = urllib2.urlopen(
            'http://gatherer.wizards.com/Pages/Search/Default.aspx?action=advanced&name=+["'
            + key.replace(' ', '%20') + '"]')
        soup = BSHTML(page)
    multiverse_id = 0
    try:
        multiverse_id = int(page.url[page.url.rfind('=') + 1:])
    except AttributeError:
        return False
    if multiverse_id == 0:
        return False
    # Load the printings page to get legality and set information
    ids_list = []
    page = urllib2.urlopen(
        'http://gatherer.wizards.com/Pages/Card/Printings.aspx?multiverseid=' + str(multiverse_id))
    soup = BSHTML(page)
    try:
        for i in range(0, 100):
            a = soup.find(
                "a", {
                    "id": "ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_PrintingsList_listRepeater_ctl0"
                    + str(i) + "_cardTitle"
                })
            if a is None:
                break
            href = a["href"]
            try:
                muid = int(href[href.rfind("=") + 1:])
            except (AttributeError, ValueError):
                muid = 0
            if muid > 0:
                ids_list.append(muid)
    except AttributeError:
        pass
    for muid in ids_list:
        card = MtgCard()
        card.search(muid, directory)
        print card
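# Usage sketch. These snippets consistently treat BSHTML as an alias for
# bs4.BeautifulSoup (an assumption, not stated in the source); this one is
# Python 2 and also needs urllib2 and an MtgCard class from its module:
#
#   from bs4 import BeautifulSoup as BSHTML
#   import urllib2
#
#   MtgGetCard('Black Lotus', '/tmp/cards')  # search Gatherer by card name
#   MtgGetCard('600', '/tmp/cards')          # or by (illustrative) multiverse id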
def downloadImage(self):
    img_url = ''
    # First try magiccards.info by set code and collector number
    try:
        if len(img_url) == 0 and hasattr(self, 'number') and hasattr(self, 'expansion_code'):
            page = urllib2.urlopen('https://magiccards.info/' + self.expansion_code.lower()
                                   + '/en/' + str(self.number) + '.html')
            soup = BSHTML(page)
            for img in soup.findAll("img"):
                if img['src'].find('scans') > -1:
                    img_url = 'https://magiccards.info/' + img['src']
    except urllib2.HTTPError:
        print '[404] https://magiccards.info/' + self.expansion_code.lower() + '/en/' + str(self.number) + '.html'
    except UnicodeDecodeError:
        print '[Unicode] https://magiccards.info/query?q=' + self.name.replace(' ', '%20').replace('\'', '%27').replace('û', '%FB') + '+e%3A' + self.expansion_code + '%2Fen&v=card&s=cname'
    # Fall back to the Gatherer image handler when a multiverse id is known
    if len(img_url) == 0:
        if hasattr(self, 'multiverseid'):
            img_url = 'http://gatherer.wizards.com/Handlers/Image.ashx?multiverseid=' + str(self.multiverseid) + '&type=card'
        elif not (hasattr(self, 'name') and hasattr(self, 'expansion_code')):
            if hasattr(self, 'number'):
                print 'No MultiverseId: ' + self.expansion_code.lower() + "/" + self.number + " (" + self.path + ")"
            else:
                print 'No Metadata: ' + self.path
            return False
    # Last resort: search magiccards.info by card name
    try:
        if len(img_url) == 0 and hasattr(self, 'name') and hasattr(self, 'expansion_code'):
            page = urllib2.urlopen('https://magiccards.info/query?q=' + self.name.replace(' ', '%20').replace('\'', '%27').replace('û', '%FB') + '+e%3A' + self.expansion_code + '%2Fen&v=card&s=cname')
            soup = BSHTML(page)
            for img in soup.findAll("img"):
                if img['src'].find('scans') > -1:
                    img_url = 'https://magiccards.info/' + img['src']
    except urllib2.HTTPError:
        print '[404] https://magiccards.info/query?q=' + self.name.replace(' ', '%20').replace('\'', '%27').replace('û', '%FB') + '+e%3A' + self.expansion_code + '%2Fen&v=card&s=cname'
    except UnicodeDecodeError:
        print '[Unicode] https://magiccards.info/query?q=' + self.name.replace(' ', '%20').replace('\'', '%27') + '+e%3A' + self.expansion_code + '%2Fen&v=card&s=cname'
    try:
        self.loadFromImage()
        image = urllib.URLopener()
        image.retrieve(img_url, self.path)
        self.image_url = img_url
        self.saveMetadata()
    except Exception:
        if hasattr(self, 'expansion_code'):
            print '404 Page not found: [' + self.expansion_code + '] ' + self.path
        else:
            print '404 Page not found: ' + self.path
        return False
    return True
def createNewsItem():
    print("not app")
    data = requests.get("http://gromdroid.nl/bslim/wp-json/wp/v2/posts/" + request.args.get("id")).json()
    soup = BSHTML(data["content"]["rendered"])
    images = soup.findAll('img')
    img = " "
    for image in images:
        img = image['src']
    apiKey = "MDQ4ZjNmYmMtYTMxMy00MzMzLWI3NWUtNTI0NWQ1MDdlYmZk"
    appId = "88a41eb2-1403-4aa9-8989-c0b430286788"
    header = {
        "Content-Type": "application/json; charset=utf-8",
        "Authorization": "Basic " + apiKey
    }
    payload = {
        "app_id": appId,
        "included_segments": ["All"],
        "contents": {"en": "Nieuws van bslim"},
        "headings": {"en": data["title"]["rendered"]}
    }
    req = requests.post("https://onesignal.com/api/v1/notifications", headers=header, data=json.dumps(payload))
    return jsonify({
        "responseCode": UserApi.createNewsItem(data["title"]["rendered"], data["content"]["rendered"], img)
    })
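# The createNewsItem variants read as Flask view functions (they use request
# and jsonify). A hypothetical registration; the app object and route are
# assumptions, not from the source:
#
#   from flask import Flask, request, jsonify
#   app = Flask(__name__)
#   app.add_url_rule('/news/create', 'createNewsItem', createNewsItem)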
def getImage(response, ID):
    timeout = 0  # If the server refuses connections due to too many requests, increase the timeout.
    htmlText = response
    if htmlText != 'Error':
        soup = BSHTML(htmlText, features="lxml")
        images = soup.findAll('img', {'id': 'screenshot-image'})
        for image in images:
            print('ID: ' + ID + ' source: ' + image['src'])  # Show output with ID and image source.
            urllib.URLopener.version = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11'
            if 'https' in image['src']:
                urllib.urlretrieve(image['src'], ID + '.png')
            else:
                urllib.urlretrieve('http:' + image['src'], ID + '.png')
        time.sleep(timeout)
    else:
        time.sleep(timeout)
def createNewsItem():
    data = requests.get("http://gromdroid.nl/bslim/wp-json/wp/v2/posts/" + request.args.get("id")).json()
    soup = BSHTML(data["content"]["rendered"])
    images = soup.findAll('video')
    img = " "
    for image in images:
        img = image['src']
    apiKey = "YTFkZGY1OGUtNGM5NC00ODdmLWJmN2QtNjMxYzNjMzk0MWJl"
    appId = "893db161-0c60-438b-af84-8520b89c6d93"
    header = {
        "Content-Type": "application/json; charset=utf-8",
        "Authorization": "Basic " + apiKey
    }
    payload = {
        "app_id": appId,
        "included_segments": ["All"],
        "contents": {"en": "Nieuws van bslim"},
        "headings": {"en": data["title"]["rendered"]}
    }
    # req = requests.post("https://onesignal.com/api/v1/notifications", headers=header, data=json.dumps(payload))
    return jsonify({
        "responseCode": UserApi.createNewsItem(data["title"]["rendered"], data["content"]["rendered"], img)
    })
def BestBuy():
    laptops = []
    driver.get("https://www.bestbuy.com.mx/c/laptops/c41")
    pag = 0
    while pag < 3:  # number of pages with content
        # Re-read the page source each iteration so the newly loaded page is parsed
        content = driver.page_source
        soup = BSHTML(content, features="html.parser")
        for a in soup.findAll('div', attrs={'class': 'product-line-item-line'}):
            name = a.find('div', attrs={'class': 'product-title'})
            laptops.append(name.text)
        driver.find_element_by_xpath(
            '//*[@id="plp-container"]/div/div[2]/div[2]/div[2]/div/div[4]/div[2]/ul/li[7]/a'
        ).click()
        print(len(laptops))
        sleep(4)
        pag += 1
    # Split each title into brand and model on the first "-"
    brands = [lap.split("-")[0].strip() for lap in laptops]
    models = [lap.split("-")[-1].strip() for lap in laptops]
    cels = BestBuyCels()
    teles = BestBuyTvs()
    df = pd.DataFrame({'Laptop Brand': brands, 'Laptop Model': models})
    df2 = pd.DataFrame({'Cellphone Name': cels})
    df3 = pd.DataFrame({'TV Name': teles})
    df.to_csv('laptops.csv', index=False, encoding='utf-8')
    df2.to_csv('cels.csv', index=False, encoding='utf-8')
    df3.to_csv('tvs.csv', index=False, encoding='utf-8')
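# The Best Buy and Amazon scrapers share a module-level Selenium driver.
# A hypothetical setup covering the globals they rely on (the browser choice
# is an assumption; find_element_by_xpath implies Selenium older than 4.3):
#
#   from selenium import webdriver
#   from time import sleep
#   from bs4 import BeautifulSoup as BSHTML
#   import pandas as pd
#
#   driver = webdriver.Chrome()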
def getImageURL(siteURL):
    req = Request(siteURL, headers={'User-Agent': 'Mozilla/5.0'})
    try:
        page = urlopen(req).read()
        soup = BSHTML(page, 'html.parser')
        images = soup.find_all('meta', {"name": "og:image"})
        if (len(images) > 0 and images[0]['content'] !=
                "https://geniuskitchen.sndimg.com/fdc-new/img/fdc-shareGraphic.png"):
            return images[0]['content']
        else:
            return "../static/images/recipe-placeholder-image.svg"
    except Exception:
        print("Failed to fetch image.")
        return "../static/images/recipe-placeholder-image.svg"
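# A hypothetical call, assuming the imports the snippet relies on
# (urllib.request and bs4.BeautifulSoup aliased as BSHTML):
#
#   from urllib.request import Request, urlopen
#   from bs4 import BeautifulSoup as BSHTML
#
#   url = getImageURL('https://www.food.com/recipe/...')
#   # returns the og:image content, or the placeholder path on failure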
def AmazonLaptops():
    link1 = 'https://www.amazon.com.mx/gp/bestsellers/electronics/10189669011?ref_=Oct_s9_apbd_obs_hd_bw_bB7aoOB_S&pf_rd_r=5794MD68EN89CRQZWMDQ&pf_rd_p=58d7811c-7134-5551-b955-42726ceffed4&pf_rd_s=merchandised-search-10&pf_rd_t=BROWSE&pf_rd_i=10189669011'
    link2 = 'https://www.amazon.com.mx/gp/bestsellers/electronics/10189669011/ref=zg_bs_pg_2?ie=UTF8&pg=2'
    laptops = []
    driver.get(link1)
    sleep(2)
    content = driver.page_source
    soup = BSHTML(content, features="html.parser")
    for a in soup.findAll(
            'div', attrs={'class': 'p13n-sc-truncate-desktop-type2 p13n-sc-truncated'}):
        laptops.append(a.text)
    driver.get(link2)
    sleep(2)
    # Re-read the page source after navigating, so page 2 is actually parsed
    content = driver.page_source
    soup = BSHTML(content, features="html.parser")
    for a in soup.findAll(
            'div', attrs={'class': 'p13n-sc-truncate-desktop-type2 p13n-sc-truncated'}):
        laptops.append(a.text)
    return laptops
def get_images(url):
    rating = url.split("/")[-3].split("-")[0]
    for year in range(2004, 2019):  # 2004-2018 (this is the max range as of 2020)
        question = True
        try:
            r = requests.get(f"{url}{year}")
        except requests.RequestException:
            continue
        soup = BSHTML(r.content, features="lxml")
        images = soup.findAll("img")
        for image in images:
            if "individual-problems" in image.decode():
                # Question and answer images alternate on the page
                if question:
                    data[rating[0].upper() + "MC"][str(year)]["questions"].append(
                        base_url + str(image['src']))
                    question = False
                else:
                    data[rating[0].upper() + "MC"][str(year)]["answers"].append(
                        base_url + str(image['src']))
                    question = True
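# get_images mutates module-level state. A hypothetical setup matching the
# keys it uses (data, base_url, and the "AMC" rating key are all inferred
# from the code, not confirmed by the source):
#
#   base_url = "https://example.org"  # prefix for relative img srcs
#   data = {"AMC": {str(y): {"questions": [], "answers": []}
#                   for y in range(2004, 2019)}}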
def BestBuyTvs():
    tvs = []
    driver.get("https://www.bestbuy.com.mx/c/pantallas/c35")
    pag = 0
    while pag < 4:  # number of pages with content
        # Re-read the page source each iteration so the newly loaded page is parsed
        content = driver.page_source
        soup = BSHTML(content, features="html.parser")
        for a in soup.findAll('div', attrs={'class': 'product-line-item-line'}):
            name = a.find('div', attrs={'class': 'product-title'})
            tvs.append(name.text)
        driver.find_element_by_xpath(
            '//*[@id="plp-container"]/div/div[2]/div[2]/div[2]/div/div[4]/div[2]/ul/li[10]/a'
        ).click()
        print(len(tvs))
        sleep(6)
        pag += 1
    print(len(tvs))
    return tvs
async def corona(ctx, state="delhi"):
    url = "https://www.mohfw.gov.in/"
    page = requests.get(url).text
    soup = BSHTML(page, features="html.parser")
    table_data = [[cell.text for cell in row("td")] for row in soup("tr")]
    table_data = table_data[1:]
    for row in table_data:
        try:
            if row[1].lower() == state.lower():
                embed = discord.Embed(
                    title="Corona Update for " + row[1],
                    description="",
                    color=0x00FF00,
                    url=url,
                )
                embed.add_field(name="Active Cases", value=row[2], inline=False)
                embed.add_field(name="Cured", value=row[3], inline=False)
                embed.add_field(name="Dead", value=row[4], inline=False)
                embed.add_field(name="Total Cases", value=row[5], inline=False)
                embed.set_footer(text="Data retrieved from " + url)
                await ctx.send(embed=embed)
                break
        except:
            await ctx.send("Incorrect argument.")
            break
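# A hypothetical registration for the coroutine above, assuming discord.py's
# commands extension (the bot object and prefix are illustrative):
#
#   from discord.ext import commands
#   bot = commands.Bot(command_prefix="!")
#   corona = bot.command(name="corona")(corona)   # invoked as: !corona maharashtra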
def extract_images(message):
    soup = BSHTML(message, 'html.parser')
    imgs = soup.findAll('img')
    if len(imgs) > 0:
        for img in imgs:
            images.append(img['src'])
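# extract_images appends to a module-level list. A minimal usage sketch
# (the sample HTML is illustrative):
#
#   images = []
#   extract_images('<p>hi <img src="/a.png"></p>')
#   # images == ['/a.png']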
def update_wardrobe(self, file_id, permalink_public, file_description):
    res = requests.post(
        "https://slack.com/api/files.sharedPublicURL", {
            "token": "xoxp-434681964305-435150868323-479950531236-4283e3391cc6463655a5e91d375d1db3",
            "file": file_id
        })
    if res.ok:  # HTTP status < 400
        page = urllib2.urlopen(permalink_public)
        soup = BSHTML(page)
        images = soup.findAll('img')
        discovery_json = self.recognize_image(images[0]['src'], file_description)
        environ = os.environ
        path = os.getcwd() + "\\data\\wardrobe\\"
        with open(os.path.join(path, file_id + '.json'), 'w') as json_file:
            json.dump(discovery_json, json_file)
        with open(os.path.join(path, file_id + '.json'), 'r') as f:
            data = f.read()
        self.wardrobe_discovery_client.add_document(
            environ.get('WARDROBE_DISCOVERY_ENVIRONMENT_ID'),
            environ.get('WARDROBE_DISCOVERY_COLLECTION_ID'),
            file=data,
            filename=file_id + '.json')
    else:
        raise Exception("Unable to upload picture. Please try again")
def cleanOriginalDocs(dir):
    '''
    Given a directory containing the set of original documents from the
    DUC 2003 conference, renames and parses them into the format expected
    by our system.
    '''
    # Rename directories
    _, dirs, _ = os.walk(dir).next()
    for subdir in dirs:
        ID = subdir[1:-1]
        newDir = os.path.join(dir, ID)
        oldDir = os.path.join(dir, subdir)
        os.rename(oldDir, newDir)
        for name in os.listdir(newDir):
            # Rename the documents themselves
            tmp = name.split('.')
            fileID = tmp[0][3:] + tmp[1] + '.txt'
            newFile = os.path.join(newDir, fileID)
            oldFile = os.path.join(newDir, name)
            os.rename(oldFile, newFile)
            # Extract the text!
            with open(newFile, 'r') as txt:
                HTML = BSHTML(txt.read(), 'xml')
                text = HTML.TEXT.text.replace('\n', '')
                sentences = tokenizer.tokenize(text)
            with open(os.path.join(newDir, 'Parsed.' + fileID), 'w') as f:
                for s in sentences:
                    f.write("{}\n".format(s))
def get_iframe_tags(self, data):
    soup = BSHTML(data)
    iframes = soup.findAll('iframe')
    if iframes:
        iframe = iframes[0]
        src = iframe['src']
        return src
    return ''
def get_img_tags(self, data):
    soup = BSHTML(data)
    images = soup.findAll('img')
    if images:
        image = images[0]
        src = image['src']
        return src
    return ''
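# These two helpers return the src of the first <iframe> or <img>, or ''.
# A hypothetical call on an instance `obj` of the enclosing class:
#
#   obj.get_iframe_tags('<iframe src="https://player.example/v/1"></iframe>')
#   # -> 'https://player.example/v/1'
#   obj.get_img_tags('<p>no images here</p>')   # -> ''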
def get_img_tags(url):
    # Get all image sources from the page
    print "Getting all src from img tags of " + url
    page = urllib2.urlopen(url)
    soup = BSHTML(page, "html.parser")  # BSHTML(page) works too; the parser argument just makes the choice explicit
    images = soup.findAll('img')
    return len(images), images
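# Usage sketch (Python 2; assumes `import urllib2` and
# `from bs4 import BeautifulSoup as BSHTML`):
#
#   count, images = get_img_tags('http://example.com')
#   for img in images:
#       print img.get('src')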
def post_post_save_receiver(sender, instance, created, *arg, **kwargs):
    if created:
        soup = BSHTML(instance.content, "html.parser")
        images = soup.findAll('img')
        for image in images:
            if 'http' not in image['src']:
                picture = Picture(post_id=instance,
                                  image=image['src'].replace('/media/', ''))
                picture.save()
def getNoticeTitle(renglones):
    a = 0
    try:
        while "<h3>" not in renglones[a]:
            a += 1
        encabezado = renglones[a] + " " + renglones[a + 2]
        encabezadoBeauti = BSHTML(encabezado, 'lxml')
        return encabezadoBeauti.text
    except Exception as ex:
        print(ex)
        return "Error"
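# getNoticeTitle scans a list of HTML lines (renglones) for the first <h3>
# and joins it with the line two positions later. A hypothetical call:
#
#   lines = ['<div>', '<h3>Storm warning', '', 'issued today</h3>', '</div>']
#   getNoticeTitle(lines)   # -> 'Storm warning issued today'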
def post_post_save_receiver(sender, instance, created, *arg, **kwargs):
    if created:
        soup = BSHTML(instance.content, "html.parser")
        images = soup.findAll('img')
        for image in images:
            img_path = image['src']
            if 'https://green-edu-link-v2.s3.amazonaws.com' in img_path:
                img_path = img_path.replace(
                    'https://green-edu-link-v2.s3.amazonaws.com/media/', '')
                picture = Picture(post_id=instance, image=img_path)
                picture.save()
def storeQuestionText(ques, curr_path, programming_Lang):
    ques_Text_File = curr_path + "Question_Text/" + str(
        ques['question_id']) + "_QT" + "_" + programming_Lang
    BS = BSHTML(ques['body'])
    ques_Title = str(ques['title'].encode('utf8')) + "\n" + "----------" + "\n"
    with open(ques_Text_File, "w") as f:
        f.write(ques_Title)
        for segment in BS.find_all('p'):
            # Store the paragraph text in the file
            codeText = str(segment.get_text().encode('utf8'))
            f.write(codeText)
def stripFilms(filmRawData):
    """
    Returns a films list containing stripped-down output from a letterboxd
    json file.

    Keyword Arguments:
    filmRawData: Raw JSON letterboxd data.
    """
    films = []
    open('./error.json', 'w').close()  # truncate the error log
    for eachFilm in filmRawData["entries"]:
        try:
            rawHTML = BSHTML(eachFilm.get("summary"), 'html.parser')
            review = []
            for eachPara in rawHTML.find_all('p')[1:]:
                for eachEle in eachPara.contents:
                    review.append(str(eachEle))
                review.append('<p/>')
            imgsrc = rawHTML.find('img')['src']
            films.append({
                "title": eachFilm.get("letterboxd_filmtitle"),
                "year": eachFilm.get("letterboxd_filmyear"),
                "link": eachFilm.get("link"),
                "watcheddate": eachFilm.get("letterboxd_watcheddate"),
                "rewatch": eachFilm.get("letterboxd_rewatch"),
                "rating": eachFilm.get("letterboxd_memberrating"),
                "summary": review,
                "imgsrc": imgsrc,
            })
        except Exception:
            with open('./error.json', 'a') as f:
                e = sys.exc_info()
                f.write(str(e) + json.dumps(eachFilm) + '\n')
    return films
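# The entry keys (entries, summary, letterboxd_filmtitle, ...) look like what
# feedparser produces for a Letterboxd RSS feed; that is an inference, not
# confirmed by the source. A hypothetical pipeline under that assumption:
#
#   import feedparser
#   feed = feedparser.parse('https://letterboxd.com/someuser/rss/')
#   films = stripFilms(feed)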
def storeAnswerText(ques, curr_path, programming_Lang):
    answer_ID = 1
    for ans in ques['answers']:
        BS = BSHTML(ans['body'])
        if BS.find_all('p'):
            ans_Text_File = curr_path + "Answer_Text/" + str(
                ques['question_id']) + "_AT" + str(
                    answer_ID) + "_" + programming_Lang
            answer_ID = answer_ID + 1
            with open(ans_Text_File, "w") as f:
                for segment in BS.find_all('p'):
                    codeText = str(segment.get_text().encode('utf8'))
                    f.write(codeText)
def downloadPrice(self):
    price = 'R$ 0,00'
    try:
        page = urllib2.urlopen('https://www.ligamagic.com.br/?view=cards%2Fsearch&card=' + self.name.replace(' ', '+'))
        soup = BSHTML(page)
        for div in soup.findAll("div", {'id': 'precos-medio'}):
            price = div.text
    except urllib2.HTTPError:
        print '[404] https://www.ligamagic.com.br/?view=cards%2Fsearch&card=' + self.name.replace(' ', '+')
    self.loadFromImage()
    self.price = price
    self.saveMetadata()
    return True
def Amazon():
    link1 = 'https://www.amazon.com.mx/gp/bestsellers/electronics/9687458011?ref_=Oct_s9_apbd_obs_hd_bw_bAZbaMl_S&pf_rd_r=HZ45SGH8T7GKZ74AS33S&pf_rd_p=4d9d93c0-fea5-5ed3-9cdc-da3baf21c408&pf_rd_s=merchandised-search-10&pf_rd_t=BROWSE&pf_rd_i=9687458011'
    link2 = 'https://www.amazon.com.mx/gp/bestsellers/electronics/9687458011/ref=zg_bs_pg_2/132-1166954-1513655?ie=UTF8&pg=2'
    celulares = []
    driver.get(link1)
    sleep(2)
    content = driver.page_source
    soup = BSHTML(content, features="html.parser")
    for a in soup.findAll('div', attrs={'class': 'p13n-sc-truncated'}):
        celulares.append(a.text)
    driver.get(link2)
    sleep(2)
    content = driver.page_source
    soup = BSHTML(content, features="html.parser")
    for a in soup.findAll('div', attrs={'class': 'p13n-sc-truncated'}):
        celulares.append(a.text)
    laps = AmazonLaptops()
    # tvs = Amazontvs()
    print(len(celulares), 'cels')
    print(len(laps), 'laps')
    # print(len(tvs), 'tvs')
    if len(celulares) > 0 and len(laps) > 0:
        df = pd.DataFrame({'Cellphone Name': celulares})
        df.to_csv('celsAmazon.csv', index=False, encoding='utf-8')
        df2 = pd.DataFrame({'Laptop Name': laps})
        df2.to_csv('lapsAmazon.csv', index=False, encoding='utf-8')
def GetAmazonImgs(link):
    driver.get(link)
    content = driver.page_source
    soup = BSHTML(content, features="html.parser")
    images = soup.findAll('img')
    imgsrc = []
    imgalt = []
    for image in images:
        print(image['src'])
        imgsrc.append(image['src'])
        try:
            imgalt.append(image['alt'])
        except KeyError:
            imgalt.append('imagealt')  # placeholder for images without alt text
    return imgsrc, imgalt
def storeAnswerCode(ques, curr_path, programming_Lang):
    answer_ID = 1
    for ans in ques['answers']:
        BS = BSHTML(ans['body'])
        if BS.find_all('pre'):
            ans_segment = 1
            for segment in BS.find_all('pre'):
                ans_Code_File = curr_path + "Answer_Code/" + str(
                    ques['question_id']) + "_AC" + str(answer_ID) + "_" + str(
                        ans_segment) + "." + programming_Lang
                ans_segment = ans_segment + 1
                codeText = str(segment.get_text().encode('utf8'))
                with open(ans_Code_File, "w") as f:
                    f.write(codeText)
            # Increment after writing so numbering starts at 1, matching storeAnswerText
            answer_ID = answer_ID + 1
def __call__(self, request):
    # Code to be executed for each request before
    # the view (and later middleware) are called.
    response = self.get_response(request)
    try:
        contenttype = response['Content-Type']
    except Exception:
        contenttype = ""
    if contenttype == "text/html; charset=utf-8":
        response_text = str(response.content, encoding='UTF-8')
        soup = BSHTML(response_text, features="lxml")
        images = soup.findAll('img')
        for image in images:
            if image.has_attr('alt'):
                alt = image['alt']
            else:
                # Derive an alt text from the image file name
                alt = image['src'].replace("/media/img/", "").replace(
                    ".jpg", "").replace("_", " ").replace("-", " ")
            if not image.has_attr('title'):
                image['title'] = alt
        response_text = str(soup)
        response.content = bytes(response_text, encoding='UTF-8')
    for i in self.agents:
        if i in request.META['HTTP_USER_AGENT']:
            print(request.META['HTTP_USER_AGENT'])
            print(i)
            response.content = bytes("", encoding='UTF-8')
    # Code to be executed for each request/response after
    # the view is called.
    return response
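# A hypothetical registration for this middleware (Django settings.py; the
# class name, dotted path, and __init__ are assumptions following the
# standard Django middleware protocol):
#
#   # settings.py
#   MIDDLEWARE = [
#       # ...,
#       'myapp.middleware.ImageTitleMiddleware',
#   ]
#
#   class ImageTitleMiddleware:
#       def __init__(self, get_response):
#           self.get_response = get_response
#           self.agents = []  # user-agent substrings whose responses are blanked (assumed)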
def storeQuestionCode(ques, curr_path, programming_Lang):
    # Extract the code part from the user's question body
    BS = BSHTML(ques['body'])
    ques_segment = 1
    EntireCodeText = " "
    for segment in BS.find_all('pre'):
        ques_Code_File = curr_path + "Question_Code/" + str(
            ques['question_id']) + "_QC_" + str(
                ques_segment) + "." + programming_Lang
        ques_segment = ques_segment + 1
        codeText = str(segment.get_text().encode('utf8'))
        EntireCodeText = EntireCodeText + codeText + "\n"
        # Store the code text in the file
        with open(ques_Code_File, "w") as f:
            f.write(codeText)
    parseQuestionCode(ques, curr_path, programming_Lang, EntireCodeText)
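# The ques dict these store* helpers consume looks like a Stack Exchange API
# question object (question_id, title, body, answers); a hypothetical shape
# for testing (Python 2, matching the snippets above):
#
#   ques = {
#       'question_id': 42,
#       'title': u'How do I parse HTML?',
#       'body': u'<p>Intro.</p><pre>soup = BSHTML(html)</pre>',
#       'answers': [{'body': u'<p>Use a parser.</p>'}],
#   }
#   storeQuestionText(ques, './out/', 'python')
#   storeQuestionCode(ques, './out/', 'python')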
def download_one(i, row, save_folder):
    try:
        page = urllib.request.urlopen(row["url"])
        soup = BSHTML(page)
        images = soup.findAll("img")
    except Exception:
        print("broken link")
        return
    for image in images:
        print(image["src"])
        try:
            row["labels"] = row["labels"].strip("'")[1:].replace(" ", "_")
            print(row)
            with open(save_folder + "{}_{}.png".format(row["labels"].strip("'"), i), "wb") as f:
                f.write(requests.get(image["src"]).content)
        except Exception:
            return
    return
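# A hypothetical call, assuming a CSV-style row dict (the "url" and "labels"
# keys come from the code above; the values are illustrative):
#
#   row = {"url": "http://example.com/page", "labels": "'[cat dog'"}
#   download_one(0, row, "./downloads/")
#   # saves each <img> on the page as ./downloads/cat_dog_0.png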