def comment():

	url = 'http://www.espncricinfo.com/australia-v-india-2015-16/engine/match/895815.html?innings='
	url += str(Team_id)
	url += ';view=commentary'
	source_code = requests.get(url)   # fetch the commentary page
	plain_text = source_code.text
	soup = BeautifulSoup(plain_text, "html.parser")
	soup.encode('UTF-8')
	# Collect the ball-by-ball commentary text
	for link in soup.findAll('div', {'class': 'commentary-text'}):
		title = link.contents[1].encode('UTF-8')
		text.append(str(title))

	# Collect the matching over numbers
	for link in soup.findAll('div', {'class': 'commentary-overs'}):
		title1 = link.string
		over.append(str(title1))

	clean_up_list(text)
def play(url):

    source_code = requests.get(url)  # source code of page
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, "html.parser")
    soup.encode("UTF-8")
    h, m, s = 0, 0, 0
    count = 0
    for link in soup.findAll("div", {"class": "timestamp"}):

        count += 1
        time = link.string.split(":")
        if len(time) == 3:
            h += int(time[0])
            m += int(time[1])
            s += int(time[2])
        else:
            m += int(time[0])
            s += int(time[1])

    ts = h * 60 * 60 + m * 60 + s
    m, s = divmod(ts, 60)
    h, m = divmod(m, 60)
    d, h = divmod(h, 24)

    print("Total Videos : ", count)
    print(d, "Days", h, "Hours", m, "Minutes", s, "Seconds")
Example #3
	def scrape(self, links=[], ads=True, translator=False):
		print "Scraping ad pages..."
		responses = []
		values = {}
		data = []

		urls = self.generate_pages(self.base_url)
		
		for url in urls:
			print "Scraping URL:", url
			r = requests.get(url)
			soup = BeautifulSoup(r.text, "html.parser")
			soup.encode('utf-8')

			values["title"] = self.get_ad_title(soup)
			# values["phone_numbers"]
			values["text_body"] = self.get_ad_text(soup)
			values["images"] = self.get_ad_images(soup)
			values["link"] = url
			values["posted_at"] = self.get_ad_date(soup)
			values["scraped_at"] = str(datetime.datetime.now())
			values["language"] = "Spanish"  # Being lazy here.
			# values["polarity"]
			# values["translated_body"]
			# values["translated_title"]
			# values["subjectivity"]
			data.append(values)
			time.sleep(3)

		return data
Example #4
	def generate_pages(self, url):
		"""
		Creates a list of URLs containing ads for further scraping.
		"""
		print "Fetching pages..."
		urls = []
		while True:
			print url
			r = requests.get(url)
			soup = BeautifulSoup(r.text, "html.parser")
			soup.encode('utf-8')

			ad_links = self.get_ad_links(soup)
			for link in ad_links:
				urls.append(link)

			next = soup.find_all("a", { "class": "num_next"})

			if next:
				# Fetches the current page.
				current_page = int(soup.find_all("div", { "class": "num_sel"})[0].text)
				# Generates the link for the next page.
				new_url = str(''.join([url.split("?")[0],"?p=",str(current_page+1)]))
				url = new_url
				time.sleep(3)
			else:
				break
		# Note: List needs to be uniquified.
		return urls
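The closing note says the URL list still needs to be uniquified; a minimal sketch of an order-preserving de-duplication that could run on the returned list (the helper name is hypothetical):

# Hedged sketch: order-preserving de-duplication for the list returned by generate_pages().
def uniquify(urls):
    seen = set()
    unique = []
    for u in urls:
        if u not in seen:
            seen.add(u)
            unique.append(u)
    return unique

# e.g. urls = uniquify(scraper.generate_pages(scraper.base_url))  # "scraper" is an assumed instance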
def getHTML():

	url = "https://en.wikipedia.org/wiki/List_of_Super_Bowl_champions"
	html = urllib.request.urlopen(url).read()
	soup = BeautifulSoup(html,'html.parser')
	soup.encode('utf8')
	soup.prettify()
	extract_info(soup)	
Example #6
def seating_details(request, lan_id, seating_id=None, seat_id=None):
    lan = get_object_or_404(LAN, pk=lan_id)
    seatings = Seating.objects.filter(lan=lan)

    if not seatings:
        return render(request, 'seating/seating.html')

    if seating_id:
        seating = get_object_or_404(Seating, pk=seating_id, lan=lan)
    else:
        seating = seatings[0]
        return redirect(seating)

    users = seating.get_user_registered()
    seats = seating.get_total_seats()

    dom = BeautifulSoup(seating.layout.template, "html.parser")
    counter = 0
    for tag in dom.find_all('a'):
        children = tag.find_all('rect')
        children[0]['seat-number'] = seats[counter].pk
        children[0]['seat-display'] = seats[counter].placement
        if not seats[counter].user:
            children[0]['class'] = ' seating-node-free'
            children[0]['status'] = "free"
        else:
            if seats[counter].user == request.user:
                children[0]['class'] = ' seating-node-self'
                children[0]['status'] = "mine"
            else:
                children[0]['class'] = ' seating-node-occupied'
                children[0]['status'] = "occupied"
                children[0]['seat-user'] = unicode(seats[counter].user.get_full_name())

                #Separate title element for chrome support
                title = dom.new_tag("title")
                title.string = unicode(seats[counter].user.get_full_name())
                tag.append(title)

        counter += 1
    dom.encode("utf-8")


    context = {}
    context['seatings'] = seatings
    context['seating'] = seating
    context['seat'] = seat_id
    context['hide_sidebar'] = True
    context['template'] = dom.__str__

    return render(request, 'seating/seating.html', context)
Example #7
def wp_reformat(content):
    content = BeautifulSoup(content)
    for img in content.find_all('img'):
        src = img['src']
        src = re.sub('http://kecebongsoft.files.wordpress.com/(\d+)/(\d+)/', r'/img/wordpress/\1-\2-', src)
        img.replace_with('![image](%s)' % src)
    #content = re.sub('http://kecebongsoft.files.wordpress.com/(\d+)/(\d+)/', r'/img/wordpress/\1-\2-', content)
    #content = re.sub('<im.*src=(\'|")(.*)["\'].*>', r"![image](\2)", content)

    content = re.sub('\[caption.*?\](.|\n)*?\!(.*?\))(.|\n)*?caption]', r'\2', content.encode('ascii', 'ignore'))

    content = re.sub('\[source.*?\]((.|\n)*?)\[/sourcecode\]', r'\t:::txt\1', content.encode('ascii', 'ignore'))
    

    return content
Example #8
def parse(source):
    """Pulls out the paste"""

    soup = BeautifulSoup(source, "html5lib")
    soup.encode("utf8")
    uri = soup.find("title")
    uri = uri.encode("utf-8")
    uri = uri[7:-19] 
    #print uri
    title =  soup.find('div', {'class': 'modal-body'}).get_text().strip()
    newline = title.index('\n')
    title = title[38:newline-1]
    paste = soup.find('div', {'id': 'code'}).get_text().strip()

    return title, paste
Example #9
def serverlist(url):
    link = urllib2.urlopen(url).read()
    soup = BeautifulSoup(link.decode("utf-8"))
    epis = soup("p", {"class": "epi"})
    for i in range(0, len(epis)):
        etitle = BeautifulSoup(str(epis[i]))("b")[0].contents[0]
        addDir(etitle.encode("utf-8"), url, 3, iconimage, False, i, gname)
Example #10
def doctoText(filepath):

    """
    returns a string of text from the input file. created the if statement
    for future file formats. link below provided partial code.
    http://davidmburke.com/2014/02/04/python-convert-documents-doc-docx-odt-pdf-to-plain-text-without-libreoffice/
    """

    if filepath[-4:] == ".pdf":
        return convertpdftoText(filepath)
    elif filepath[-5:] == ".docx":
        document = opendocx(filepath)
        paratextlist = getdocumenttext(document)
        newparatextlist = []
        for paratext in paratextlist:
            newparatextlist.append(paratext.encode("utf-8"))
        return "\n\n".join(newparatextlist)
    else:
        with open(filepath, "rb") as myfile:
            try:
                # cleans html, removes tags
                htmldata = myfile.read()
                edata = htmldata.decode("utf-8", "strict")
                raw = BeautifulSoup(edata).get_text()
                cleanedhtml = raw.encode("utf-8", "strict")
                return cleanedhtml
            except:
                data = myfile.read()
                return str(data)
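The docstring above explains that the extension checks exist so more formats can be added later; a hedged sketch of the same dispatch written with os.path.splitext instead of fixed-length slices (the helper name and return labels are hypothetical):

# Hedged sketch: extension dispatch via os.path.splitext rather than filepath[-4:] / filepath[-5:].
import os

def route_by_extension(filepath):
    ext = os.path.splitext(filepath)[1].lower()
    if ext == ".pdf":
        return "pdf"     # would go to convertpdftoText()
    elif ext == ".docx":
        return "docx"    # would go through opendocx()/getdocumenttext()
    else:
        return "html"    # fall through to the BeautifulSoup clean-up branch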
def search_weibo(url):
    #url = 'http://s.weibo.com/weibo/%25E5%2591%25A8%25E7%25A5%2589%25E6%2580%2580&Refer=index'
    url = 'http://s.weibo.com/weibo/%E8%91%A3%E5%B4%87%E6%B4%8B&Refer=index'
    #url = 'http://s.weibo.com/weibo/%25E5%25BC%25A0%25E5%25A9%25B7?topnav=1&wvr=6&b=1'
    #url = 'http://www.weibo.com/u/3075975003?from=myfollow_all'
    page = urllib.urlopen(url)
    content = page.read()
    #print content
    transcode = content.decode('utf-8','ignore').encode('utf-8','ignore')
    #print type(content)
    #transcode=content.decode('gbk','ignore').encode('utf-8','ignore') 
    o = BeautifulSoup(transcode)

    #print o
    #print type(o)
    #print dir(o)
    #print o.find_all('p')
    
    #print o.findAll('p')
    #print o.title
    
    f = open('web.txt','w+')
    '''
    for item in o: 
        f.write("%s" % item)
    f.close()
    '''
    open('origin.txt','w+').write(transcode)

    f.write(o.encode('gbk'))
    f.close()
def get_solution(url):
	#url = 'https://community.topcoder.com/stat?c=problem_solution&cr=40440099&rd=16747&pm=14278'
	
	#url = 'https://community.topcoder.com/stat?c=problem_solution&rm=329103&rd=16775&pm=14340&cr=23089515'

	#url = 'https://community.topcoder.com/stat?c=problem_solution&cr=40364957&rd=16747&pm=14278'

	print url

	#tcsso = 'b0be8a6e3acae9d8743c91ada7294a5b65a698b0dfa82cda539d54a7d41e7584'

	cookies = dict()
	#cookies['tcsso'] = '40451530|b0be8a6e3acae9d8743c91ada7294a5b65a698b0dfa82cda539d54a7d41e7584'
	#'40451530|b0be8a6e3acae9d8743c91ada7294a5b65a698b0dfa82cda539d54a7d41e7584'

	#cookies['JSESSIONID'] = 'UYKd7Rv1-OY-6bmewBWJDw**.tomcat_tc01'

	print cookies

	page = requests.get(url, cookies=cookies)
	#print page
	if str(page) == "<Response [503]>":
		while str(page) == "<Response [503]>":
			time.sleep(1)
			page = requests.get(url, cookies=cookies)
	html_content = page.text

	#print html_content[0:100000]

	#soup = BeautifulSoup(html_content, "html.parser")

	#text = soup.select("body > table > tbody > tr > td.bodyText > table.paddingTable > tbody > tr:nth-child(1) > td > table:nth-child(4) > tbody > tr:nth-child(13) > td")

	body = re.findall('<TD CLASS="problemText" COLSPAN="8" VALIGN="middle" class="alignMiddle" ALIGN="left">\n            (.+?)<BR>\n        </TD>', html_content, flags=re.S)

	text = body[0]

	text = text.replace("<BR>","\n")

	#print w

	#print repr(text)
	print text

	failed_to_download = None
	solution = None


	if len(text)==0:
		failed_to_download = solution_id
	else:
		body = BeautifulSoup(str(text), "html.parser").get_text()

		body = body.replace("\\","\\\\")
		solution = body.encode('utf-8').decode('string-escape')

		#print repr(solution)
		#print solution

	return solution
Example #13
    def updateMahoyo(self, progress):
        progress = BeautifulSoup(progress)
        exception = self.master.modules["commands"].exception

        user = yield self.config.get("user")
        passwd = yield self.config.get("pass")
        if user is None or passwd is None:
            raise exception(u"No blog username or password in config")

        blog = XMLRPC("http://commiesubs.com/xmlrpc.php")

        post = yield blog.callRemote("wp.getPost", 0, user, passwd, 8367)
        content = BeautifulSoup(post["post_content"])

        old = content.find(class_="progress")
        new = progress.find(class_="progress")
        old.replace_with(new)
        content = content.encode(formatter="html")

        try:
            yield blog.callRemote(
                "wp.editPost", 0, user, passwd, 8367, {"post_content": content}
            )
        except:
            raise exception(u"Couldn't update post")
Example #14
    def search_pmc(self, pmc_id):
        """fetch the documents' content (have pmc id)"""
        query = self.baseURL + "efetch.fcgi?db=pmc&id={id}" . format(id=pmc_id)
        self.logger.info("search the content of pmc document based on its 'id'={id}. searching url={url}" . format(id=pmc_id, url=query))
        try:
            content = BS(urlopen(query, timeout=TIMEOUT).read(), 'lxml')
        except:
            content = ""

        if "<?properties open_access?>" in content.encode('utf-8'):
            self.logger.info("access the open_access document. pmc_id={pid}" . format(pid=pmc_id))
            parsed_doc = self.eutils_parse_doc(content)
            return {
                "pmc_id": pmc_id,
                "abstract": parsed_doc[0],
                "body": parsed_doc[1],
                "entry_created_date": str(datetime.datetime.utcnow())
            }
        else:
            return ""
            self.logger.info("cannot access the document (pmc_id={pid}). Crawl it by web crawler!" . format(pid=pmc_id))
            parsed_doc = self.advanced_parse_doc(pmc_id)
            return {
                "pmc_id": pmc_id,
                "abstract": parsed_doc[0],
                "keyword": parsed_doc[1],
                "body": parsed_doc[2],
                "bib": parsed_doc[3],
                "entry_created_date": str(datetime.datetime.utcnow())
            }
Example #15
def scrapSatCen():

    print("#========================= SatCen SCRAPING =========================")

    SatCenData = satcen.returnAgency('SATCEN')
    SatCen_link = SatCenData['link'][0]
    SatCen_id = SatCenData['id'][0]
    SatCen_source = urllib.request.urlopen(SatCen_link)

    # Retrieve the list of jobs as a bs4 navigable string
    soup = BeautifulSoup(SatCen_source, 'html.parser')

    # Convert to bytes
    bytesEncoded = soup.encode('utf-8')
    # Convert to string
    stringDecoded = bytesEncoded.decode('utf-8')
    # Convert to dictionary
    jobsdict = json.loads(stringDecoded)
    # Browse the dictionary and select available positions
    for job in jobsdict:
        if (job['Status']=='OPEN') and (job['InternalOnly'] == False):
            link = 'https://apps.satcen.europa.eu/recruitment/#/vacancy/'+job['Id']
            print(job['Reference'], job['ExpireOn'][:10],job['Title'],format.typeOfPost(job['TypePost']),job['WorkUnit'],link)
            satcen.persist(SatCen_id, job['Title'],job['Reference'],job['WorkUnit'],'', job['ExpireOn'][:10],link,'', format.typeOfPost(job['TypePost']))

    print("#========================SATCEN SCRAPING COMPLETE=================================")
Example #16
def get_correctedFiles(path, save, url, img):

    if not os.path.exists(save):
        os.makedirs(save)

    for f in os.listdir(path):
        print "correcting file %s" % f
        infile = open(os.path.join(path, f)).read()
        
        soup = BeautifulSoup(infile, "html5lib")
        for tag in soup.find_all(lambda t: 'href' in t.attrs or 'src' in t.attrs):
            if 'href' in tag.attrs:
                url_parts = urlparse.urlsplit(tag.attrs["href"])
                full_path = tag.attrs["href"]
                hrefpath = url_parts.path
                if full_path[0:4] != "http" and full_path[0:5] != " http":
                    # for wiki conversion (moin moin wikis)
                    # hrefpath = hrefpath.replace("/", "|")
                    if hrefpath[0:6] == "|wiki|":
                        hrefpath = hrefpath[6:]
                    tag.attrs["href"] = urlparse.urljoin(url, hrefpath)
            else:
                url_parts = urlparse.urlsplit(tag.attrs["src"])
                srcpath = url_parts.path
                srcparts = srcpath.split("/")
                srcpath = srcparts[len(srcparts) -1]
                tag.attrs["src"] = urlparse.urljoin(img, srcpath)

        
        outfile = open(os.path.join(save, f), "w")
        outfile.write(soup.encode("ascii", "xmlcharrefreplace"))
        outfile.close()
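The rewriting above leans on urlparse.urljoin to resolve wiki paths and image names against a base URL; a small illustration of that resolution (Python 3 module name, example URLs are hypothetical):

# Hedged sketch of the urljoin resolution the href/src rewriter relies on.
from urllib.parse import urljoin

print(urljoin('http://wiki.example.org/base/', 'FrontPage/SubPage'))
# http://wiki.example.org/base/FrontPage/SubPage
print(urljoin('http://img.example.org/static/', 'diagram.png'))
# http://img.example.org/static/diagram.png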
def get_solution(contest, solution_id):
	url = 'http://codeforces.com/contest/' + str(contest[0]) + '/submission/' + str(solution_id)
	
	print url

	page = requests.get(url)
	if str(page) == "<Response [503]>":
		while str(page) == "<Response [503]>":
			time.sleep(1)
			page = requests.get(url)
	html_content = page.text

	#print html_content

	soup = BeautifulSoup(html_content, "html.parser")

	text = soup.select("body > div > div > div > div > pre")

	failed_to_download = None
	solution = None


	if len(text)==0:
		failed_to_download = solution_id
	else:
		body = BeautifulSoup(str(text[0]), "html.parser").get_text()

		body = body.replace("\\","\\\\")
		solution = body.encode('utf-8').decode('string-escape')

	return solution_id, solution, failed_to_download
def get_solution(solution_id):
	#solutions = []
	#failed_to_download_s = []
	#for i in solution_ids:
	url = "https://www.codechef.com/viewplaintext/" + str(solution_id)
	
	page = requests.get(url)
	if str(page) == "<Response [503]>":
		while str(page) == "<Response [503]>":
			time.sleep(1)
			page = requests.get(url)
	html_content = page.text

	if html_content == None:
		return solution_id, None, solution_id

	text = BeautifulSoup(html_content, "html.parser").get_text()

	#'''figure out if escape_lt needs to go here'''

	print len(text)
	#print text


	failed_to_download = None
	solution = None

	#print text
	if len(text)==0 or re.search(re.escape('var _sf_startpt = (new Date()).getTime()'), text) != None:
		failed_to_download = solution_id
	else:
		text = text.replace("\\","\\\\")
		solution = text.encode('utf-8').decode('string-escape')

	return solution_id, solution, failed_to_download
Example #19
def formatabbr(page):
    srd = SoupStrainer('div', id='container')
    div = BeautifulSoup(page, parse_only=srd).div
    nav = div.find('div', id='navigation')
    nav.decompose()
    tbl = div.find('table')
    tbl.name = 'div'
    tbl.attrs.clear()
    tbl['class'] = 'oH1'
    tdr = div.find_all(name=re.compile(r't[dr]', re.I))
    for t in tdr:
        t.unwrap()
    for p in div.find_all('p'):
        p['class'] = 'ZFY'
        p.name = 'div'
    blank = div.find('div', class_='blank')
    if blank:
        blank.decompose()
    ft = div.find('div', id='footer')
    if ft:
        ft.decompose()
    formatcontent(div)
    div.attrs.clear()
    div['class'] = 'RmY'
    text = cleansp(div.encode('iso-8859-1'))
    div.decompose()
    return ''.join(['<link rel="stylesheet" href="ety.css" type="text/css">', text])
def serverlist(url):
    link = urllib2.urlopen(url).read()
    soup = BeautifulSoup(link.decode('utf-8'))
    epis = soup('p',{'class':'epi'})
    for i in range(0,len(epis)):
        etitle = BeautifulSoup(str(epis[i]))('b')[0].contents[0]
        addDir(etitle.encode('utf-8'),url,3,iconimage,False,i)
Example #21
 def handle_html_content(self, content):
     soup = BeautifulSoup(content, 'html.parser')
     for p_elem in soup.find_all('p'):
         css = None
         if 'style' in p_elem.attrs:
             css = cssutils.parseStyle(p_elem.attrs['style'])
         text_list = p_elem.text.split()
         p_new = soup.new_tag('p', style=css.cssText if css else None)
         for idx, word in enumerate(text_list):
             if len(self.dorks) <= 0:
                 self.dorks = yield from self.get_dorks()
             word += ' '
             if idx % 5 == 0:
                 a_tag = soup.new_tag(
                     'a',
                     href=self.dorks.pop(),
                     style='color:{color};text-decoration:none;cursor:text;'.format(
                         color=css.color if css and 'color' in css.keys() else '#000000'
                     )
                 )
                 a_tag.string = word
                 p_new.append(a_tag)
             else:
                 p_new.append(soup.new_string(word))
         p_elem.replace_with(p_new)
     content = soup.encode('utf-8')
     return content
Example #22
def makeappdx(page):
    srd = SoupStrainer('div', id='container')
    div = BeautifulSoup(page, parse_only=srd).div
    nav = div.find('div', id='navigation')
    nav.decompose()
    title = div.center.get_text(strip=True)
    div.center.decompose()
    font = div.find_all('font', size='2', color=None)
    for f in font:
        f.unwrap()
    for p in div.find_all('p'):
        p['class'] = 'ZFY'
        p.name = 'div'
    blank = div.find('div', class_='blank')
    if blank:
        blank.decompose()
    ft = div.find('div', id='footer')
    if ft:
        ft.decompose()
    div.attrs.clear()
    div['class'] = 'oH1'
    formatcontent(div)
    text = cleansp(div.encode('iso-8859-1'))
    div.decompose()
    return ''.join(['<div class="xsv">', title, '</div>', text])
def text_cleaner(stripped_advertiser, website):  # clean up html and read words
    soup_obj = BeautifulSoup(website, "html.parser")
    for script in soup_obj(["script", "style"]):  # get rid of that nasty
        # javascript
        script.extract()  # Remove these two elements from the BS4 object
    text = soup_obj.get_text().replace(".", "")  # we convert text in place
    # several times coming up
    with open('sites/{}.html'.format(stripped_advertiser.lower()), 'w') as f:
        f.write(soup_obj.encode("utf-8"))  # write html out to file
    lines = (line.strip() for line in text.splitlines())  # break into lines
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))

    def chunk_space(chunk):  # individual utf-8 encoded word chunks
        chunk_out = chunk + " "
        return chunk_out
    text = "".join(chunk_space(chunk) for chunk in chunks if chunk).encode(
        "utf-8")
    try:
        text = text.decode("unicode_escape").encode("ascii", "ignore")
    except Exception as error:
        return  # just in case there are some weird characters here, don't kill
    text = re.sub("[^a-zA-Z.+3]", " ", text)  # only regular words, no numbers
    text = text.lower().split()  # lower case so the dict can be populated
    stop_words = set(stopwords.words("english"))
    text = [w for w in text if w not in stop_words]  # gets rid of stop words
    text = list(set(text))  # no repeats!
    for word in text:
        if word not in fashion_dict:
            fashion_dict[word] = 1  # populate fashion dict
        else:
            fashion_dict[word] += 1  # increment entry
    return text
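The manual populate/increment bookkeeping at the end of text_cleaner could equally be expressed with collections.Counter; a hedged sketch (the counter name is hypothetical, and like the original it counts each word once per page because the word list was already de-duplicated):

# Hedged sketch: the same per-page word counting with collections.Counter.
from collections import Counter

fashion_counter = Counter()

def count_page_words(words):
    fashion_counter.update(words)   # words is already de-duplicated for the page
    return fashion_counter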
def downloadthread(fil):
    apps_links = read_from_file(fil)
    for link in apps_links[1:3]:
    #    link = apps_links[1] # dev version of for loop
        req = urllib2.Request("%s%s" % (base_url,link), None, headers)
        html_doc = urllib2.urlopen(req).read()
        ################        Title extract
        startstring = '<title>'
        start = html_doc.find(startstring)+len(startstring)
        endstring='| AppBrain Android Market</title>'
        end = html_doc.find(endstring)
        Title = html_doc[start:end]        
        ################        Description extract
        startstring = '<div class="app_descriptiontab">'
        start = html_doc.find(startstring)+len(startstring)
        endstring='<div style="position: absolute; right: 0px; bottom: 0px">'
        end = html_doc.find(endstring)
        description = html_doc[start:end]
        ################ 
        description = description.strip() # get rid of whitespace
        description = BeautifulSoup(description)
        description = description.get_text() # get rid of html
        subdir = "\\fulldescriptions\\" + fil[1:fil.find(".")]+"\\"
        path = basedir+subdir
        filename = path+Title+".txt"
        if not os.path.exists(path): # if folder does not exist create it.
            os.makedirs(path)
        with open(filename, "w") as txtfile:
            txtfile.write(description.encode('utf8'))
        print("link:%s done" % link)
def serializeLabContent(labContent):
	#print labContent
	f = open('template.html', "r")
	labHtml = f.read()
	f.close()
	labTemplate = BeautifulSoup(labHtml)
	articleSection = labTemplate.find_all('div', id="experiment-article-sections")[0]
	sectionNumber = 1
	for sectionName,sectionContent in labContent:
		sectionTag = labTemplate.new_tag('section', id="experiment-article-section-"+str(sectionNumber))
		articleSection.append(sectionTag)
		iconTag = labTemplate.new_tag('div', id="experiment-article-section-"+str(sectionNumber)+"-icon")
		iconTag['class']='icon'
		sectionTag.append(iconTag)
		headingTag = labTemplate.new_tag('div', id="experiment-article-section-"+str(sectionNumber)+"-heading")
		headingTag['class']='heading'
		headingTag.append(sectionName)
		sectionTag.append(headingTag)
		contentTag = labTemplate.new_tag('div', id="experiment-article-section-"+str(sectionNumber)+"-content")
		contentTag['class']='content'
		contentTag.append(sectionContent)
		sectionTag.append(contentTag)
		sectionNumber +=1	
	f = open('content.html', "w+")
	labTemplate = labTemplate.prettify()
	f.write(labTemplate.encode('utf-8'))
	f.close()
def extractCourseCatalog(School):
	#accesses the url & takes html
	url = urllib2.urlopen(School.url)
	html = url.read()
	#parsing html web content
	txt_content = BeautifulSoup(html, 'html.parser').get_text()
	#creating a file, converting text to unicode, saving file
	txt_name = School.name + '.txt'
	txt_file = open(txt_name, 'w')
	encoded_content = txt_content.encode('utf-8')
	txt_file.write(encoded_content)
	txt_file.close()

	#use regex to find courses, remove duplicates and sort
	course_names = re.findall('COMP [0-5]\d*', html)
	uniq_names = list(set(course_names))
	uniq_names.sort()

	if (School.name == 'Carleton'):
		getCourseDescriptions(School, uniq_names, txt_content)

	else:
		getCourseDescriptions(School, uniq_names, encoded_content)

	names_file = open(School.name + '_courses.txt', 'w')

	# put each course in school_courses file exactly once
	for course in (uniq_names):
		names_file.write(course + '\n')

	names_file.close()
Example #27
def getNewestData():
	file = open("Data/vextir.txt", "w")
	soup = BeautifulSoup(urllib2.urlopen("http://www.landsbankinn.is/Vextir").read())
	file.write(soup.encode("utf-8"))
	file.close()

	return soup
Example #28
def downloadCourse(session, c, sem):
    global files
    global sections
    files = itertools.count()
    sections = itertools.count()
    name = c['key'].replace('/', '-') + u'/'
    path = root + sem.replace('/', '-') + u'/' + name
    path = urllib.url2pathname(path.encode('utf-8')).replace(':', '-').replace('"', '')
    if not os.path.exists(path):
        os.makedirs(path)
    print '       +--' + colors.BOLD + name + colors.ENDC
    r = session.get(c['url'])
    if(r.status_code == 200):
        soup = BeautifulSoup(r.text, 'html.parser')
        if not os.path.exists(path + '.dump'):
            os.makedirs(path + '.dump')

        dst = path + '.dump/' + c['key'].replace('/', '-').encode('utf-8') + '-' + c['type'] + '-' + str(datetime.date.today()) + '-full.html'
        dst = dst.replace(':', '-').replace('"', '')
        
        with open(dst, 'wb') as f:
            f.write(soup.encode('utf-8'))
        for s in soup.find_all(class_='section main clearfix'):
            downloadSection(session, s, path)
        #print 'Saved ' + str(files.next()) + ' Files in ' + str(sections.next()) + ' Sections'
    else:
        print 'ERROR: ' + str(r.status_code) + ' ' + r.reason
        sys.exit()
Example #29
    def __get_item_description(self, soup, path):
        pattern = re.compile(r'//dsc\.taobaocdn\.com.*?,')
        scripts = soup.find_all("script")
        for script in scripts:
            if len(script.contents) == 0:
                continue
            match = pattern.findall(script.contents[0])
            if len(match) != 1:
                continue
            description_url = match[0].split(":")[0][:-2]
            # description_url = description_url.replace('"', '')
            # description_url = description_url.replace(' ', '')

            protocol = 'http:'
            description = requests.get(protocol + description_url)
            description = description.text
            pattern = re.compile(r'<.*>')
            match = pattern.findall(description)
            s = BeautifulSoup(match[0], 'html.parser')
            img_list = s.find_all('img')

            # 创建描述图片存放目录
            if not os.path.exists(path):
                os.makedirs(path)
            if len(img_list) != 0:
                self.__download_and_replace(img_list, path)
            return str(s.encode('utf-8').replace('\n', ''))
        raise Exception(ERROR_DESCRIPTION_URL)
Example #30
def crawlUrl(url):
    print url

    global i
    global fDict
    global fList

    print len(fDict)
    print len(fList)
    print len(visited)
    try:
        response=urllib2.urlopen(url)
        visited[url]=True
        header=response.info()
        type=header.getheader('content-type')
        if 'text/html' in type:
            content = response.read()
            #print content
            soup = BeautifulSoup(content)
            data = soup.encode('utf8').lower()
            if (('world war' in data)|('stalingrad' in data)):
                hList=getLinks(url,soup)
                updateFile(soup,url,hList,header,content)
                i+=1
                print 'i is',i
            else:
                pass

    except:
        print 'Unable to open URL',url
        pass
Example #31
def strip_sphinx_documentation(source_dir, generated_dir, lang_destination_dir, lang, version):
    # Go through each file, and if it is a .html, extract the .document object
    #   contents
    for subdir, dirs, all_files in os.walk(generated_dir):
        for file in all_files:
            subpath = os.path.join(subdir, file)[len(
                generated_dir):]

            if not subpath.startswith('/.') and not subpath.startswith(
                '/_static') and not subpath.startswith('/_doctrees'):
                new_path = lang_destination_dir + subpath

                if '.html' in file or '_images' in subpath or '.txt' in file or '.json' in file:
                    if not os.path.exists(os.path.dirname(new_path)):
                        os.makedirs(os.path.dirname(new_path))

                if '.html' in file:
                    # Soup the body of the HTML file.
                    # Check if this HTML was generated from Markdown
                    original_md_path = get_original_markdown_path(
                        source_dir, subpath[1:])

                    if original_md_path:
                        # If this html file was generated from Sphinx MD, we need to regenerate it using python's
                        # MD library.  Sphinx MD library is limited and doesn't support tables
                        markdown_file(original_md_path, version, '', new_path)

                        # Since we are ignoring SPHINX's generated HTML for MD files (and generating HTML using
                        # python's MD library), we must fix any image links that starts with 'src/'.
                        image_subpath = None

                        parent_paths = subpath.split('/')
                        if '' in parent_paths:
                            parent_paths.remove('')

                        image_subpath = ''

                        # -1 because we nest it 1 further levels? No idea.
                        for i in range(len(parent_paths) - 1):
                            image_subpath = image_subpath + '../'

                        # hardcode the sphinx '_images' dir
                        image_subpath += '_images'

                        with open(new_path) as original_html_file:
                            soup = BeautifulSoup(original_html_file, 'lxml')

                            prepare_internal_urls(soup, lang, version)

                            image_links = soup.find_all(
                                'img', src=re.compile(r'^(?!http).*'))

                            if len(image_links) > 0:
                                for image_link in image_links:
                                    image_file_name = os.path.basename(
                                        image_link['src'])

                                    if image_subpath:
                                        image_link['src'] = '%s/%s' % (
                                            image_subpath, image_file_name)
                                    else:
                                        image_link['src'] = '_images/%s' % (
                                            image_file_name)

                                with open(new_path, 'w') as new_html_partial:
                                    new_html_partial.write(soup.encode("utf-8"))
                    else:
                        with open(os.path.join(subdir, file)) as original_html_file:
                            soup = BeautifulSoup(original_html_file, 'lxml')

                        document = None
                        # Find the .document element.
                        if version == '0.9.0':
                            document = soup.select('div.body')[0]
                        else:
                            document = soup.select('div.document')[0]

                        with open(new_path, 'w') as new_html_partial:
                            new_html_partial.write(document.encode("utf-8"))

                elif '_images' in subpath or '.txt' in file or '.json' in file:
                    # Copy to images directory.
                    copyfile(os.path.join(subdir, file), new_path)
Example #32
def zdf2pdf(entries, opts):
    from bs4 import BeautifulSoup
    import urllib, urlparse
    import xhtml2pdf.pisa as pisa
    try:
        import cStringIO as SIO
    except ImportError:
        import StringIO as SIO

    # Save the current directory so we can go back once done
    startdir = os.getcwd()

    # Start the xhtml to be converted
    data = '<head>\n'

    # Normalize all of the given paths to absolute paths
    opts['output_file'] = os.path.abspath(opts['output_file'])
    opts['work_dir'] = os.path.abspath(opts['work_dir'])
    attach_dir = os.path.join(opts['work_dir'], 'attach')

    # Check for and create working directory
    if not os.path.isdir(opts['work_dir']):
        os.makedirs(opts['work_dir'])

    # Check for and create a directory for attachments and images
    if not os.path.isdir(attach_dir):
        os.makedirs(attach_dir)

    # Save the running configuration for rerunning
    parser = configparser.SafeConfigParser()
    config_opts = dict(
        (k, v) for k, v in opts.iteritems()
        if (k != 'json_file' and k != 'categories' and k != 'forums'
            and k != 'topics' and k != 'run_section' and k != 'list_zdf' and
            k != 'work_dir' and k != 'delete' and k != 'url' and k != 'mail'
            and k != 'password' and k != 'is_token' and v != None))
    if config_opts.has_key('style_file'):
        config_opts['style_file'] = os.path.basename(config_opts['style_file'])
    if config_opts.has_key('output_file'):
        config_opts['output_file'] = os.path.basename(
            config_opts['output_file'])
    config_opts['json_file'] = 'entries.json'

    parser.add_section('zdf2pdf')
    for k, v in config_opts.iteritems():
        parser.set('zdf2pdf', k, unicode(v))
    with codecs.open(os.path.join(opts['work_dir'], 'zdf2pdf.cfg'), 'w',
                     'utf-8') as config_file:
        parser.write(config_file)

    if opts['style_file']:
        # Save the style file in the working directory
        shutil.copy(opts['style_file'], opts['work_dir'])
        data += """<link rel="stylesheet" type="text/css"
                   href="{}" />\n""".format(
            os.path.basename(opts['style_file']))

    data += '</head>\n<body>\n'

    # Add PDF header if given
    if opts['header']:
        data += opts['header'] + '\n'

    if opts['footer']:
        data += opts['footer'] + '\n'

    # Build anything provided that should go on the title page
    if opts['title'] or opts['author'] or opts['date'] or opts['copyright']:
        if opts['title_class']:
            title_class = ' class="{}"'.format(opts['title_class'])
        else:
            title_class = ''

        data += '<div{}>\n'.format(title_class)

        if opts['title']:
            data += '<h1>{}</h1>\n'.format(opts['title'])

        if opts['author']:
            data += '<div>{}</div>\n'.format(opts['author'])

        if opts['date']:
            data += '<div>{}</div>\n'.format(opts['date'])

        if opts['copyright']:
            data += '<div>{}</div>\n'.format(opts['copyright'])

        data += '</div>\n'

    # Go through the JSON and build a toc and body to add to the html data
    entry_ids, body, toc = process_entries(entries)

    # Put all of the body after the table of contents
    if opts['toc']:
        if opts['toc_class']:
            toc_class = ' class="{}"'.format(opts['toc_class'])
        else:
            toc_class = ''
        data += '<div{}>\n<h2>{}</h2>\n<ol>\n'.format(toc_class,
                                                      opts['toc_title'])
        data += toc
        data += '</ol>\n</div>\n'
    data += body

    # Change to working directory to begin file output
    os.chdir(opts['work_dir'])

    # Save entries
    with open('entries.json', "w") as outfile:
        outfile.write(json.dumps(entries))

    # Make the data a traversable beautifulsoup
    soup = BeautifulSoup(data)

    if opts['pre_width']:
        # Monkey patch TextWrapper for splitting on any whitespace and add
        # splitting on commas. Save the old one for when we're done.
        old_wordsep_simple_re = textwrap.TextWrapper.wordsep_simple_re
        new_wordsep_simple_re = re.compile(r'(\s+|\,)')
        textwrap.TextWrapper.wordsep_simple_re = new_wordsep_simple_re

        w = textwrap.TextWrapper(width=opts['pre_width'],
                                 replace_whitespace=False,
                                 drop_whitespace=False,
                                 break_on_hyphens=False,
                                 break_long_words=True)
        for pre in soup.find_all('pre'):
            pre_str = ''
            try:
                for line in pre.string.splitlines():
                    pre_str += '\n'.join(w.wrap(line)) + '\n'
                pre.string = pre_str
            except AttributeError:
                # pre tag has no content
                pass

        # Put the original wordsep_simple_re back
        textwrap.TextWrapper.wordsep_simple_re = old_wordsep_simple_re

    # Get images and display them inline
    for img in soup.find_all('img'):
        # Handle relative and absolute img src
        src = urlparse.urljoin(opts['url'], img['src'])

        # Normalize the local filename
        srcfile = os.path.join(attach_dir, src.replace('/', '_'))

        # Get this image if not already present
        if not os.path.isfile(srcfile):
            urllib.urlretrieve(src, srcfile)

        # Update the tag for the relative filepath
        img['src'] = srcfile

    # Make relative links to entries and absolute links to entries point to PDF
    # anchors. e.g.
    # http://example.zendes.com/entries/21473796-title
    # /entries/21473796-title
    # TODO /entries/21473796-title#anchor
    r = re.compile('(?:' + opts['url'] + ')?/entries/([0-9]*)-.*')
    for a in soup.find_all('a'):
        try:
            m = r.match(a['href'])
            # modify the link if we have a match and the entry is in the PDF
            if m and int(m.group(1)) in entry_ids:
                a['href'] = '#{}'.format(m.group(1))
        except KeyError:
            # this a tag doesn't have an href. named anchor only?
            pass

    if opts['strip_empty']:
        soup = strip_empty_tags(soup)

    html = soup.encode('utf-8')

    # Save generated html
    with open('entries.html', "w") as outfile:
        outfile.write(html)

    pdf = pisa.CreatePDF(SIO.StringIO(html),
                         file(opts['output_file'], "wb"),
                         encoding='utf-8')

    if pdf.err and pdf.log:
        for mode, line, msg, code in pdf.log:
            print "%s in line %d: %s" % (mode, line, msg)

    if pdf.warn:
        print "*** %d WARNINGS OCCURED" % pdf.warn

    os.chdir(startdir)
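The anchor rewriting near the end of zdf2pdf is driven by a single regex that recognises both absolute and relative entry links; a tiny standalone sketch of that match (the base URL is a hypothetical stand-in, the entry id comes from the comment above):

# Hedged sketch of the entry-link matching used above, with made-up values.
import re

base_url = 'http://example.zendesk.com'
r = re.compile('(?:' + base_url + ')?/entries/([0-9]*)-.*')

m = r.match('/entries/21473796-title')
if m:
    print('#{}'.format(m.group(1)))    # prints #21473796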
Example #33
    soup = BeautifulSoup(page.content, 'html.parser')

    tr_elements = soup.find_all("tr", class_='ranking-list')
    j = 0
    for tr in tr_elements:

        link = tr.find("td", class_='title al va-t word-break').find('a',
                                                                     href=True)

        title = tr.find("div", class_='di-ib clearfix').find('a').text
        print(title)
        link = link['href']
        newURL = link + "/characters"
        newPage = requests.get(newURL)
        newSoup = BeautifulSoup(newPage.content, 'html.parser')
        soupStr = str(newSoup.encode('utf-8'))

        index1 = (soupStr).rindex("<h2")
        index2 = soupStr.rindex("</td")
        soupStr = soupStr[:index1] + soupStr[index2:]
        newSoup = BeautifulSoup(soupStr, 'html.parser')
        tables = newSoup.find('table').find('td',
                                            valign="top",
                                            style='padding-left: 5px;')
        rows = tables.find_all('tr')
        for row in rows:
            std = row.find('td', align="right")
            for names in std.find_all('tr'):
                subStd = names.find('td')
                if (subStd.find('small').text == 'Japanese'):
                    name = subStd.find('a').text
Example #34
    def parse(self, response):
        base_url = 'http://sh.lianjia.com'
        items = []
        res = response.body
        soup = BeautifulSoup(res, 'html.parser')
        soup.encode('utf-8')
        for fang in soup.select('.info-panel'):
            item = FangSpiderItem()
            item['fang_key'] = fang.select('h2')[0].a['key'].strip()
            item['fang_desc'] = fang.select('h2')[0].text.strip()
            item['fang_url'] = base_url + fang.select(
                'h2')[0].a['href'].strip()
            item['price'] = fang.select('.price')[0].text.strip()
            item['price_pre'] = fang.select('.price-pre')[0].text.strip()
            item['xiaoqu'] = fang.select('.where')[0].a.text.strip()
            item['huxing'] = fang.select('.where')[0].contents[3].text.strip()
            item['mianji'] = fang.select('.where')[0].contents[5].text.strip()
            item['bankuai'] = ''
            item['chaoxiang'] = ''
            item['age'] = ''
            item['subway'] = ''
            item['taxfree'] = ''
            item['haskey'] = ''
            item['col_look'] = ''
            item['quyu'] = fang.select('.con')[0].contents[1].text.strip()
            #item['bankuai']=fang.select('.con')[0].contents[3].text.strip()
            if len(fang.select('.con')[0].contents) >= 4:
                item['louceng'] = fang.select(
                    '.con')[0].contents[4].string.strip()
            if len(fang.select('.con')[0].contents) >= 6:
                item['chaoxiang'] = fang.select(
                    '.con')[0].contents[6].string.strip()
            if len(fang.select('.con')[0].contents) >= 8:
                item['age'] = fang.select('.con')[0].contents[8].string.strip()
            if len(fang.select('.con')[0].contents) > 9:
                item['age'] = fang.select(
                    '.con')[0].contents[-1].string.strip()
            if len(fang.select('.fang-subway-ex')) > 0:
                item['subway'] = fang.select('.fang-subway-ex')[0].text.strip()
            if len(fang.select('.taxfree-ex')) > 0:
                item['taxfree'] = fang.select('.taxfree-ex')[0].text.strip()
            if len(fang.select('.haskey-ex')) > 0:
                item['haskey'] = fang.select('.haskey-ex')[0].text.strip()
            if len(fang.select('.square')) > 0:
                item['col_look'] = fang.select('.square')[0].span.text.strip()
            #print u'在售:', u'房源编号:',item['fang_key'],u'房源描述:',item['fang_desc'],\
            #  u'区域:',item['quyu'],u'版块:',item['bankuai'], u'楼层:',item['louceng'],u'朝向:',item['chaoxiang'],u'房龄:',item['age'],\
            #  u'小区:',item['xiaoqu'],u'户型 :', item['huxing'],u'面积:',item['mianji'],\
            #  u'总价:',item['price'],u'单价:',item['price_pre'],u'看房人数:',item['col_look']#,u'房源链接:',item['fang_url']#\
            #  #u'交通 :',item['subway'],u'税费:',item['taxfree'],u'钥匙:',item['haskey'],u'房源链接:',item['fang_url']
            #print u'在售:',  item['fang_key'],item['fang_desc'],\
            #   item['quyu'] ,item['louceng'], item['chaoxiang'], item['age'],\
            #   item['xiaoqu'] , item['huxing'],item['mianji'],\
            #   item['price'],item['price_pre'],item['col_look']#,u'房源链接:',item['fang_url']#\
            #u'交通 :',item['subway'],u'税费:',item['taxfree'],u'钥匙:',item['haskey'],u'房源链接:',item['fang_url']
            items.append(item)
        #for item in items:
        #     print u'在售:', u'房源编号:',item['fang_key'],u'房源描述:',item['fang_desc'],u'房源链接:',item['fang_url'],\
        #         u'区域:',item['quyu'],u'版块:',item['bankuai'], u'楼层:',item['louceng'],u'朝向:',item['chaoxiang'],u'房龄:',item['age'],\
        #         u'小区:',item['xiaoqu'],u'户型 :', item['huxing'],u'面积:',item['mianji'],\
        #         u'总价 :',item['price'],u'单价:',item['price_pre'],u'看房人数:',item['col_look'],\
        #         u'交通 :',item['subway'],u'税费:',item['taxfree'],u'钥匙:',item['haskey']

        return items
            continue
        soup = ''
        HTMLFILE = str(line[1]) + '.htm'
        TEXTFILE = str(line[1]) + '.txt'
        HADOOP_HTMLFILE = 'user/root/crawls/' + str(ANET) + '/' + str(
            BNET) + '/' + HTMLFILE
        HADOOP_TEXTFILE = 'user/root/texts/' + str(ANET) + '/' + str(
            BNET) + '/' + TEXTFILE
        print "-======= site: " + str(url) + " =======-"
        try:
            soup = BeautifulSoup(html)
        except:
            print " soup exception"
            continue
        HFP = open(HTMLFILE, 'w')
        HFP.write(soup.encode('utf-8'))
        HFP.close()
        with open(HTMLFILE) as hfp:
            try:
                client.create_file(HADOOP_HTMLFILE, hfp)
            except:
                client.delete_file_dir(HADOOP_HTMLFILE)
                client.create_file(HADOOP_HTMLFILE, hfp)

        TFP = open(TEXTFILE, 'w')
        WRITEOUT = unicode(soup.get_text())
        WORDLIST = re.sub(r'[^a-zA-Z0-9 ]', r' ', WRITEOUT)
        WORDLIST = WORDLIST.strip().split()
        TFP.write(WRITEOUT.encode('utf-8'))
        TFP.close()
        PAGETITLE = ''
Example #36
    def link_modifier(self, search_string, page_location):
        """
        Find a word and link it to the page location. Add to
        list of updated items
        :param search_string: string to be linked
        :param page_location: string of the name of linked page
        :return:
        """
        for response in self.response['results']:
            if response['type'] != 'page':
                continue

            # copy the response
            response_copy = {
                'id': response['id'],
                'type': response['type'],
                'title': response['title'],
                'version': {},
                'body': {}
            }
            response_copy['body']['storage'] = {}
            response_copy['body']['storage']['representation'] = response[
                'body']['storage']['representation']
            response_copy['body']['storage']['value'] = response['body'][
                'storage']['value']
            response_copy['version'][
                'number'] = response['version']['number'] + 1
            response_body = response_copy['body']['storage']['value']

            bs = BeautifulSoup(response_body, "html.parser")
            matches = bs.findAll(text=re.compile(r'\b' + search_string +
                                                 r'\b'))

            if not matches:
                return

            change_count = 0
            for match in matches:
                grand_parent = match.parent.parent.name

                # check if word is part of a markdown
                if "ac:" in grand_parent:
                    if grand_parent == "ac:link":
                        try:
                            existing_link = match.parent.previous_sibling[
                                'ri:content-title']
                        except:
                            print "Error: detected self referencing link at: {}"\
                                .format(response['title'])
                            continue
                        if existing_link != page_location:
                            match.parent.previous_sibling[
                                'ri:content-title'] = page_location
                            change_count += 1
                        else:
                            continue
                    else:
                        continue
                else:
                    # don't add links in tables
                    # for parent in match.parents:
                    #     if "table" in parent:
                    #         continue
                    substituted = re.sub(
                        r'\b' + search_string + r'\b',
                        self.LINK1 + page_location + self.LINK2 +
                        search_string + self.LINK3, match)
                    match.replaceWith(BeautifulSoup(substituted,
                                                    "html.parser"))
                    change_count += 1

            if change_count:
                # do replacement
                response_copy['body']['storage']['value'] = bs.encode('utf-8')
                self.to_be_updated.append(response_copy)
                self.responses.append(response)
            else:
                continue
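The substitution inside link_modifier wraps a bare word in link markup using word boundaries; a reduced sketch of that step with hypothetical LINK1/LINK2/LINK3 values standing in for the class constants:

# Hedged sketch of the word-boundary substitution; LINK1/LINK2/LINK3 are stand-ins.
import re

LINK1, LINK2, LINK3 = '<a href="', '">', '</a>'

def link_word(text, search_string, page_location):
    return re.sub(r'\b' + re.escape(search_string) + r'\b',
                  LINK1 + page_location + LINK2 + search_string + LINK3,
                  text)

print(link_word('see the glossary for details', 'glossary', 'Glossary'))
# see the <a href="Glossary">glossary</a> for details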
Example #37


#request the html and check for specific content every 10 seconds
import time
url ='https://weather.yahoo.co.jp/weather/jp/13/4410.html'
#url = input("enter url to request :")
flag = True
counter = 0
while flag :
    result = get_html(url)
    if result :
        counter+=1
        soup = BeautifulSoup(result,'html.parser')
        print(soup)
        soupText = str(soup.encode('utf-8'))
        with open('result.txt', 'w') as f:
            f.write(soupText)

        #access to div class attr way1
        divs = soup.findAll('div',{'class':'forecastCity'})
        print(id(divs))

        #access to div class attr way2
        divs2 = soup.select('div[class*=forecast]')
        print(id(divs2))
        time.sleep(10)



    def crawl_episode_info(self):
        mdir =os.path.dirname(os.path.abspath(__file__))
        fname = mdir+'/feed.xml'
        url = 'http://dataskeptic.com/feed.rss'
        if not(os.path.isfile(fname)):
            print('EP:fetching')
            r = requests.get(url)
            f = open(fname, 'wb')
            f.write(r.text.encode('utf-8'))
            f.close()
        with open(fname) as fd:
            xml = xmltodict.parse(fd.read())
        episodes = xml['rss']['channel']['item']
        descriptions = []
        descToTitle = {}
        descToLink = {}
        descToNum = {}
        l = len(episodes)
        for episode in episodes:
            enclosure = episode['enclosure']
            desc = episode['description']
            desc = desc.replace(u'\xa0', u' ')
            desc = desc.replace(u'\n', u' ')
            desc = desc.replace(u'\xc2', u' ')
            desc = BeautifulSoup(desc, "lxml").text
            if len(desc) >= 5:
                descriptions.append(desc)
                descToTitle[desc] = episode['title']
                descToLink[desc] = episode['link']
                descToNum[desc] = l
                l = l - 1
        result = {}
        for desc in descriptions:
            info = {}
            info["link"] = descToLink[desc]
            info["title"] = descToTitle[desc]
            info["num"] = descToNum[desc]
            result[desc] = info
        mdir = os.path.dirname(os.path.abspath(__file__))

        if not os.path.exists(mdir+'/text/'):
            os.makedirs(mdir+'/text/')
        with open(mdir+'/text/episodes_json.txt', 'w') as outfile:  
            json.dump(result, outfile)

        with open(mdir+'/text/episode_titles.txt','w') as thefile:
            for i in range(len(descriptions)):
                desc = descriptions[i]
                title = descToTitle[desc]
                title = title.replace('[MINI]', "")
                title = title.encode('utf-8').strip()
                title = str(title).replace('\n', "") 
                thefile.write("%s\n" % str(title))

        with open(mdir+'/text/episode_descs_titles.txt', 'w') as thefile:  
            for i in range(len(descriptions)):
                desc = descriptions[i]
                title = descToTitle[desc]
                desc = desc.encode('utf-8').strip()
                desc = str(desc).replace('\n', "") 
                title = title.replace('[MINI]', "")
                title = title.encode('utf-8').strip()
                title = str(title).replace('\n', "") 
                thefile.write("%s\n" % str(title+", "+desc)) 
        self.descriptions = descriptions
def get_descriptions(problem):
    descriptions = []
    left_out = []
    failed_to_download_d = []
    #print problem_list
    #for i in problem_list:

    url = 'https://www.hackerearth.com/problem/algorithm/' + problem
    #url = 'https://www.hackerearth.com/problem/algorithm/' + i
    #url = 'https://www.hackerearth.com/problem/algorithm/' + 'ways-of-seeing-circuits'
    #print url
    #url = "https://www.codechef.com/api/contests/PRACTICE/problems/" + str(i)

    print url

    page = requests.get(url)

    if str(page) == "<Response [503]>":
        while str(page) == "<Response [503]>":
            time.sleep(1)
            page = requests.get(url)

    html_content_all = page.text

    if re.search('"message":"requests limit exhausted"',
                 html_content_all) != None:
        while re.search('message":"requests limit exhausted',
                        html_content_all) != None:
            time.sleep(1)
            page = requests.get(url)
            html_content_all = page.text

    if html_content_all == None:
        failed_to_download_d.append(problem)

    #print html_content_all

    soup = BeautifulSoup(html_content_all)
    #html_content = soup.findAll("div", { "class" : "starwars-lab" })
    #'''
    html_content_1 = soup.findAll("div", {"class": "starwars-lab"})
    html_content_2 = soup.findAll(
        "div", {"class": "less-margin-2 input-output-container"})
    html_content_3 = soup.findAll("div", {"class": "standard-margin"})
    #'''

    #raw = BeautifulSoup(str(html_content[0]).replace("</p>", "\n</p>").replace("<sup>", "<sup>^").replace("\le", u"≤").replace("\ge", u"≥").replace("\lt", "<").replace("\gt", ">"), "html.parser").get_text()
    raw = BeautifulSoup(
        str(html_content_1[0]).replace("</p>", "\n</p>").replace(
            "<sup>", "<sup>^"), "html.parser").get_text() + BeautifulSoup(
                str(html_content_2[0]).replace("</p>", "\n</p>").replace(
                    "<sup>", "<sup>^"),
                "html.parser").get_text() + BeautifulSoup(
                    str(html_content_3[0]).replace("</p>", "\n</p>").replace(
                        "<sup>", "<sup>^"), "html.parser").get_text()

    #if re.search("https://d320jcjashajb2.cloudfront.net/media/uploads", str(html_content_all)) == None and re.search('"message":"Problem is not visible now. Please try again later."', str(html_content_all)) == None and re.search('Statement is not available', str(html_content_all)) == None:
    if re.search(
            "https://d320jcjashajb2.cloudfront.net/media/uploads",
            html_content_all
    ) == None and re.search(
            '"message":"Problem is not visible now. Please try again later."',
            html_content_all) == None and re.search(
                'Statement is not available', html_content_all) == None:
        raw = raw.replace("\n\n\n\n\n\n", "")
        #raw = raw.replace("\n\n\n\n\n", "\n")

        raw = raw.replace("\n\n\n", "\n")
        raw = raw.replace("\n\n\n", "\n\n")

        raw = raw.replace("\n\n\n", "\n\n")

        raw = raw.replace("<sup>", "<sup>^")

        raw = raw.replace("\in", u"∈").replace('$$', '')

        raw = raw.replace(" <=", u" ≤").replace(" >=", u" ≥").replace(
            "<=", u" ≤ ").replace(">=", u" ≥ ").replace(u"≤  ", u"≤ ").replace(
                u"≥  ", u"≥ ").replace("\le",
                                       u"≤").replace("\ge", u"≥").replace(
                                           "\lt", "<").replace("\gt", ">")

        raw = re.sub('Subtasks(.+?)SAMPLE INPUT',
                     'SAMPLE INPUT',
                     raw,
                     flags=re.S)

        raw = re.sub('Time Limit:(.+)', '', raw, flags=re.S)

        raw = re.sub('See Russian translation\n\n', '', raw, flags=re.S)
        raw = re.sub('See Russian translation', '', raw, flags=re.S)

        raw = raw.replace("\\", "\\\\")

        descriptions.append(raw.encode('utf-8').decode('string-escape'))

    else:
        #left_out.append(i)
        #descriptions.append(raw.encode('utf-8').decode('string-escape'))
        left_out.append(problem)

    return descriptions, left_out, failed_to_download_d
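# A minimal sketch (the helper name and the 1-second pause are assumptions, not
# part of the scraper above) of factoring the two polling loops -- HTTP 503 and
# the "requests limit exhausted" message -- into a single retry helper.
import re
import time

import requests


def fetch_with_retry(url, pause=1):
    # Re-request until the server stops returning 503 and stops reporting
    # that the request limit is exhausted.
    page = requests.get(url)
    while page.status_code == 503:
        time.sleep(pause)
        page = requests.get(url)
    while re.search('"message":"requests limit exhausted"', page.text) is not None:
        time.sleep(pause)
        page = requests.get(url)
    return page.text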
Example #40
0
    )
    meta2["name"] = "viewport"
    headTag.append(meta2)
    content.body.insert_before(headTag)

    return content


def changeImgSrcAttr(soup):
    # Use s attribute instead of src for images
    for imgElement in soup.select('img[src]'):
        imgElement["s"] = imgElement["src"]
        del imgElement["src"]

    return soup


if __name__ == '__main__':
    if len(sys.argv) < 2:
        print "Usage: " + sys.argv[0] + " <inFile> [outFile]"
        exit(1)

    file = sys.argv[1]
    soup = BeautifulSoup(open(file))
    soup = cleanUp(soup)
    soup = changeImgSrcAttr(soup)
    file = sys.stdout
    if len(sys.argv) > 2:
        file = open(sys.argv[2], 'w')
    file.write(soup.encode('utf-8'))
Example #41
0
        continue

    hud = hud.replace("Dinero", "Mono")
    hud = hud.replace("Día ", "Tago ")
    if hud:
        hud += "\n"
    passage = passages[name]
    # print(Fore.RED + str(passage))

    if not esperanto:
        continue
    passage.string = hud + esperanto
    print(name)

    eo_all_links = re.findall('\[\[([^\[\]]*)\]\]', esperanto)
    for r in eo_all_links:
        assert 1 <= len(r.split('|')) <= 2

    es_links = [
        r.split('|')[-1] for r in re.findall('\[\[([^\[\]]*)\]\]', spanish)
    ]
    eo_links = [
        r.split('|')[-1] for r in re.findall('\[\[([^\[\]]*)\]\]', esperanto)
    ]
    if es_links != eo_links:
        print(es_links, eo_links)
    assert es_links == eo_links

with open(output_file, "wb") as file:
    file.write(soup.encode(formatter="html"))
from bs4 import BeautifulSoup
import requests

URL = 'https://en.wikipedia.org/wiki/Google'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser').prettify()

# prettify() already returns a str, so no encode()/str() round-trip is needed
text = soup
file = open("input.txt", "w", encoding="utf-8")
file.write(text)
file.close()

import nltk

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('maxent_ne_chunker')
nltk.download('words')

file = open("input1.txt", "w", encoding="utf-8")
for k in text.split("\n"):
    text1 = str(nltk.re.sub(r"[^a-zA-Z0-9]+", ' ', k))
    file.write(text1 + "\n")
file.close()

stokens = nltk.sent_tokenize(text1)
wtokens = nltk.word_tokenize(text1)
for s in stokens:
    print(s)

tagged = nltk.pos_tag(wtokens)
Example #43
0
            filepath = os.path.join(args.source, relpath, filename)
            dstpath = os.path.join(doc_dir, relpath, filename)
            logging.info('Processing file {}'.format(filepath))

            indexs = list()
            try:
                assert filepath.endswith('.html')
                with open(filepath, 'rb') as src:
                    soup = BeautifulSoup(src.read().decode('utf8'), 'lxml')
                    for index in extract_cppmodule(soup):
                        indexs.append(index)
                    for index in extract_sectionlink(soup):
                        indexs.append(index)
                    remove_navbar(soup)
                    with open(dstpath, 'wb') as dst:
                        dst.write(soup.encode('utf8'))
            except AssertionError:
                # except:
                shutil.copy(filepath, dstpath)

            for name, typ, pos in indexs:
                name = re.sub('\s+', ' ', name)
                assert '\n' not in name
                if '#' in pos[1:] or '/' in pos:
                    continue
                # print(name, typ, pos)
                # assert('#' not in pos[1:])
                if not pos.startswith('#'):
                    pos = '#' + pos
                pos = os.path.join(relpath, filename) + pos
                cur.execute(
from bs4 import BeautifulSoup

html_encoding = 'gbk'

wanted_html_part = r'''
<td style="width: 700px; height: 20px;font-size:13.5px; " valign="middle">
    <a href="https://example.com/dist/standalone.html?eid=xxx" target="_blank">xxx</a>
</td>
'''.strip().decode("utf-8")  # type: unicode

with open("Default.aspx") as fp:
    html_content = fp.read()

wanted_html = html_content.decode(html_encoding)  # type: unicode

# https://blog.csdn.net/adinlead/article/details/53897409
soup = BeautifulSoup(
    wanted_html, 'html.parser'
)  # do NOT use 'lxml' here: it drops the '.aspx'-specific tags; 'html.parser' is not perfect either

wanted_res = soup.find('td',
                       style="width: 700px; height: 20px;font-size:13.5px; ")
# Beautiful Soup replaces < with &lt;
# https://stackoverflow.com/questions/52040260/beautiful-soup-replaces-with-lt
wanted_res.replace_with(wanted_html_part)  # insert the raw HTML as a string; it stays unescaped only because formatter=None is used below
# https://www.crummy.com/software/BeautifulSoup/bs4/doc/#output-formatters
original_html_content = soup.encode(html_encoding, formatter=None)

with open("Default1.aspx", 'w') as fp:
    fp.write(original_html_content)
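# A hedged aside on the escaping behaviour mentioned in the comments above (a
# standalone sketch, not part of the Default.aspx script): replace_with() on a
# plain string escapes the markup on output, while replacing with a parsed
# fragment keeps real tags, so formatter=None is then unnecessary.
from bs4 import BeautifulSoup

demo = BeautifulSoup("<td>old</td>", "html.parser")
demo.td.replace_with("<a href='#'>new</a>")
print(demo)  # &lt;a href='#'&gt;new&lt;/a&gt; -- the string was escaped

demo = BeautifulSoup("<td>old</td>", "html.parser")
fragment = BeautifulSoup("<a href='#'>new</a>", "html.parser")
demo.td.replace_with(fragment.a)
print(demo)  # <a href="#">new</a> -- real tags survive the default formatter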
Example #45
0
kursor.execute("delete from corona")

# r = requests.get("https://pomber.github.io/covid19/timeseries.json").json()

# number = 1

# for y in r['Indonesia']:

# 	kursor.execute("insert into corona(id,negara,date,confirmed,deaths,recovered,datetime) values(%s,%s,%s,%s,%s,%s,%s)", (number,'Indonesia',y['date'],y['confirmed'],y['deaths'],y['recovered'],datetime.datetime.now(pytz.timezone('Asia/Jakarta'))))

# 	number += 1

page = requests.get(
    'https://www.worldometers.info/coronavirus/country/indonesia/').text
soup = BeautifulSoup(page, 'html.parser')
soup.encode('utf-8')

# cases = soup.find("div", {"class": "maincounter-number"}).find("span", {"style", {"color" : "#aaa"}})[0].get_text().strip()

cases = soup.find_all("div", {"class": "maincounter-number"})

hasil = []

for x in cases:

    children = x.findChildren("span", recursive=True)

    for y in children:

        iwant = y.text.split(' ')[0].strip()
Example #46
0
# A basic crawler written with requests and bs4
# https://www.cnblogs.com/baojinjin/p/6819389.html
import requests
from bs4 import BeautifulSoup

res = requests.get('http://book.zongheng.com/chapter/734213/40615154.html')
res.encoding = 'utf-8'
# print(res.text)
soup = BeautifulSoup(res.text, 'html.parser')
# print(soup.encode('gb18030'))
print(soup.encode('utf-8', 'ignore'))


Example #47
0
# 2) Replace the main picture with a picture of yourself.
# 3) Replace any local images with the image I provided in media. (You
# must keep the image in a separate folder from your HTML code.)

# Deliverables
# Make sure the new page is uploaded to your GitHub account.
from bs4 import BeautifulSoup
import urllib
import requests
import re

base_url = 'http://collemc.people.si.umich.edu/data/bshw3StarterFile.html'
r = requests.get(base_url)
soup = BeautifulSoup(r.text, 'html.parser')

str_soup = soup.encode("ascii", "ignore").decode(
    "utf-8")  # encode then decode converts the soup to a plain string (non-ASCII characters are dropped)

main_img_replace = str_soup.replace(
    "https://testbed.files.wordpress.com/2012/09/bsi_exposition_041316_192.jpg",
    "media/IMG_6293.jpg")  #replacing main picture with personal picture
local_img_replace = main_img_replace.replace(
    "logo2.png",
    "media/logo.png")  #replacing logo pictures with photo from media file

x = local_img_replace.replace(
    "student", "AMAZING student")  #replaces student with AMAZING student

fout = open("new_html.html", "w")  #creating a write-able html file
fout.write(x)  #writing into html file
fout.close()
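# An alternative sketch, not part of the assignment solution above (the output
# filename new_html_soup.html is made up): the two image swaps can also be done
# on the parsed tree instead of the raw string, which limits the replacement to
# <img> tags only. Note this covers only the images, not the "AMAZING student"
# text change.
for img in soup.find_all("img"):
    src = img.get("src", "")
    if src == "https://testbed.files.wordpress.com/2012/09/bsi_exposition_041316_192.jpg":
        img["src"] = "media/IMG_6293.jpg"  # personal picture, same path as above
    elif src.endswith("logo2.png"):
        img["src"] = "media/logo.png"  # logo from the media folder, same path as above

with open("new_html_soup.html", "w", encoding="utf-8") as fout:
    fout.write(str(soup))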
Example #48
0
def get_release():
    req = requests.get('https://github.com/redis/redis/releases')
    html = req.text.encode('utf-8')
    soup = BeautifulSoup(html, 'html.parser')
    print(soup.encode('utf-8'))
Example #49
0
class Flipkart:
  def __init__(self, url):
    # self.url = "https://www.amazon.in/Bose-SoundLink-Wireless-Around-Ear-Headphones/dp/B0117RGG8E/ref=sr_1_11?qid=1562395272&refinements=p_89%3ABose&s=electronics&sr=1-11"
    self.url = url
    self.old_price = 99999999
    self.count = 0
    self.product_details = ""

  def request_with_ua(self):
    self.error = ""
    lines = open("user_agents.txt").read().splitlines()
    user_agent =  random.choice(lines)

    self.headers = { 'User-Agent' : user_agent } 
    print("New Flipkart request with : ", user_agent)
    response = requests.get(self.url, headers=self.headers)

    self.soup = BeautifulSoup(response.content, 'html.parser')
    self.soup.encode('utf-8')

    try:
      #check whether browser version is supported or not
      self.error = self.soup.find("div", {"class": "popup-header"}).text.strip()
      print("Browser is no longer supported")

    except Exception as be:
      print("Browser is supported")
      # print('Error on line {}'.format(sys.exc_info()[-1].tb_lineno), type(be).__name__, be)
      pass
      # print(be)

    # f = open("soup.html", "w",encoding= "utf-8")
    # f.write(str(self.soup))
    # f.close()
  
  def check_price(self):    
    try:
      
      self.request_with_ua()

      while(self.error=="Your browser is no longer supported"):
        print("User agent switching\n")
        self.request_with_ua()

      try:
        # print("current price block")
        self.title = self.soup.find("span", {"class": "_35KyD6"}).text
        self.current_price = self.soup.find("div", {"class": "_1vC4OE _3qQ9m1"}).get_text().replace(',', '').replace('₹', '').replace(' ', '').strip()
        # print(self.current_price)
      except:
        # self.current_price = soup.find(id = "priceblock_dealprice").get_text().replace(',', '').replace('₹', '').replace(' ', '').strip()
        print("self.current_price exception")

      try:
        self.current_price = int(self.current_price.split(".")[0])
        self.review_count = self.soup.find("span", {"class": "_38sUEc"}).get_text()
        self.stars = self.soup.find("div", {"class": "hGSR34"}).text
        
        try:
          self.product_dict = {'Product Name': self.title, 'price': self.current_price, 'stars':self.stars, 'Number of reviews and ratings': self.review_count}
        except NameError as e:
          self.product_dict = {'Product Name': self.title, 'price': self.current_price, 'stars': "Unable to fetch", 'Number of reviews and ratings': "Unable to fetch"}

        for key,value in self.product_dict.items():
          self.product_details = self.product_details + str(key) + " : "+ str(value) + "\n"
        # # print(json.dumps(jsonObject, indent=2))

        if(self.current_price < self.old_price):
          self.old_price = self.current_price
          if self.count == 0:
            self.count = 1  # mark that the first price report has been returned
            return self.product_details
          return False
        else:
          return self.product_details

      except Exception as qq:
        print("second end block")
        print('Error on line {}'.format(sys.exc_info()[-1].tb_lineno), type(qq).__name__, qq)

    except Exception as ww:
      print("end block")
      print('Error on line {}'.format(sys.exc_info()[-1].tb_lineno), type(ww).__name__, ww)
Example #50
0
from bs4 import BeautifulSoup

waybillNo_list =['810131162977', '810131167088', '810131151219', '810131166299']

headers = {
    'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding':'gzip, deflate',
    'Accept-Language':'en,en-US;q=0.8,zh;q=0.6,zh-CN;q=0.4',
    'Cache-Control':'max-age=0',
    'Connection':'keep-alive',
    'Content-Length':'46',
    'Content-Type':'application/x-www-form-urlencoded',
    'Host':'trace.yto.net.cn:8022',
    'Origin':'http://www.yto.net.cn',
    'Referer':'http://www.yto.net.cn/gw/index/index.html',
    'Upgrade-Insecure-Requests':'1',
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}

for waybillNo in waybillNo_list:

    payload = {
        'waybillNo': waybillNo
    }

    res = requests.post("http://trace.yto.net.cn:8022/TraceSimple.aspx",data = payload, headers = headers)

    result = BeautifulSoup(res.text,"html5lib").select('.data')[-1].text.strip()
    
    print waybillNo + "; "+ str(result.encode('utf8')).replace("感谢使用圆通速递,期待再次为您服务", "")
Example #51
0
def test_reporting(rf, admin_user):

    product_price = 100
    product_count = 2
    tax_rate = Decimal("0.10")
    line_count = 1

    expected_taxful_total, expected_taxless_total, shop, order = initialize_report_test(product_price,
                                                                                        product_count,
                                                                                        tax_rate,
                                                                                        line_count)

    with override_provides("reports", [__name__ + ":SalesTestReport"]):
        data = {
            "report": SalesTestReport.get_name(),
            "shop": shop.pk,
            "date_range": DateRangeChoices.THIS_YEAR.value,
            "writer": "json",
            "force_download": 1,
        }

        view = ReportView.as_view()
        request = apply_request_middleware(rf.post("/", data=data), user=admin_user)
        response = view(request)
        if hasattr(response, "render"):
            response.render()
        assert response.status_code == 200
        json_data = json.loads(response.content.decode("utf-8"))
        assert force_text(SalesTestReport.title) in json_data.get("heading")
        totals = json_data.get("tables")[0].get("totals")
        return_data = json_data.get("tables")[0].get("data")[0]
        assert int(totals.get("product_count", 0)) == product_count
        assert int(return_data.get("product_count", 0)) == product_count
        assert int(totals.get("order_count", 0)) == 1
        assert int(return_data.get("order_count", 0)) == 1
        assert str(expected_taxless_total) in totals.get("taxless_total", "0")
        assert str(expected_taxful_total) in totals.get("taxful_total", "0")

        today = date.today()
        last_year = date(today.year - 1, 1, 1)
        next_year = date(today.year + 1, 1, 1)

        # test report without downloading it
        data = {
            "report": SalesTestReport.get_name(),
            "shop": shop.pk,
            "date_range": DateRangeChoices.CUSTOM.value,
            "start_date": last_year.strftime("%Y-%m-%d"),
            "end_date": next_year.strftime("%Y-%m-%d"),
            "writer": "json",
        }

        request = apply_request_middleware(rf.post("/", data=data), user=admin_user)
        response = view(request)
        assert response.status_code == 200

        soup = BeautifulSoup(response.render().content)
        response_text = str(six.u(soup.encode('ascii')))
        assert force_text(SalesTestReport.title) in response_text
        assert str(expected_taxless_total) in response_text
        assert str(expected_taxful_total) in response_text
Example #52
0
    writer = csv.writer(f1, lineterminator='\n', )
    writer.writerow(headerrow)
    for page in range(1, 10000000):

        page_url = "https://www.thredup.com/products/women/shorts?department_tags=women&page=" + str(
            page) + "&search_tags=women-shorts&sort=Newest+First"
        #		uClient = uReq(page_url)
        print(page_url)
        scrappage = requests.get(page_url)
        html_doc = scrappage.text
        page_soup = BeautifulSoup(html_doc, 'lxml')
        # parses html into a soup data structure to traverse html
        # as if it were a json data type.
        # page_soup = BeautifulSoup(uClient.read(), "html.parser")
        # print(page_soup.prettify())
        errtxt1 = page_soup.encode("utf-8")
        errtxt = page_soup.prettify()
        errfound = False
        errfound1 = False
        if errtxt.find('Try removing some filters to see more items.') >= 0:
            errfound = True
        if errtxt.find('We couldn\'t find anything matching your search.') >= 0:
            errfound1 = True

        if errfound and errfound1:
            errorpage = errorpage + '\n' + 'Error or no data found Record No : ' + str(j) + ' Page No : ' + str(page)
            nexterrorpage = nexterrorpage + 1
        if nexterrorpage > consterror:
            print(errorpage)
            break
        # 	print("The ERROR IS RAISED")
Example #53
0
    def __init__(self, username, downloadPhotos):
        self.useragents = [
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/602.2.14 (KHTML, like Gecko) Version/10.0.1 Safari/602.2.14',
            'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36',
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0'
        ]

        self.username = username
        # Make the directory that we are putting the files into
        self.make_directory()
        print(colors.OKGREEN + f"[*] Starting Scan on {self.username}" +
              colors.ENDC)
        # Get the html data with the requests module
        r = requests.get(
            f'http://instagram.com/{self.username}',
            headers={'User-Agent': random.choice(self.useragents)})
        soup = BeautifulSoup(r.text, 'html.parser')
        # To prevent a unicode error, we need the following line...
        soup.encode('utf-8')
        # Find the tags that hold the data we want to parse
        general_data = soup.find_all('meta',
                                     attrs={'property': 'og:description'})
        more_data = soup.find_all('script', attrs={'type': 'text/javascript'})
        description = soup.find('script',
                                attrs={'type': 'application/ld+json'})
        # Try to parse the content -- if it fails then the program exits
        try:
            self.text = general_data[0].get('content').split()
            # This is the profile description data
            self.description = json.loads(description.get_text())
            # This is the javascript json that is passed into json.loads()
            self.profile_meta = json.loads(
                more_data[3].get_text()[21:].strip(';'))

        except:
            print(colors.FAIL + f"Username {self.username} not found" +
                  colors.ENDC)
            sys.exit()
        self.profile_data = {
            "Username":
            self.profile_meta['entry_data']['ProfilePage'][0]['graphql']
            ['user']['username'],
            "Profile name":
            self.description['name'],
            "URL":
            self.description['mainEntityofPage']['@id'],
            "Followers":
            self.text[0],
            "Following":
            self.text[2],
            "Posts":
            self.text[4],
            "Bio":
            str(self.profile_meta['entry_data']['ProfilePage'][0]['graphql']
                ['user']['biography']),
            "profile_pic_url":
            str(self.profile_meta['entry_data']['ProfilePage'][0]['graphql']
                ['user']['profile_pic_url_hd']),
            "is_business_account":
            str(self.profile_meta['entry_data']['ProfilePage'][0]['graphql']
                ['user']['is_business_account']),
            "connected_to_fb":
            str(self.profile_meta['entry_data']['ProfilePage'][0]['graphql']
                ['user']['connected_fb_page']),
            "externalurl":
            str(self.profile_meta['entry_data']['ProfilePage'][0]['graphql']
                ['user']['external_url']),
            "joined_recently":
            str(self.profile_meta['entry_data']['ProfilePage'][0]['graphql']
                ['user']['is_joined_recently']),
            "business_category_name":
            str(self.profile_meta['entry_data']['ProfilePage'][0]['graphql']
                ['user']['business_category_name']),
            "is_private":
            str(self.profile_meta['entry_data']['ProfilePage'][0]['graphql']
                ['user']['is_private']),
            "is_verified":
            str(self.profile_meta['entry_data']['ProfilePage'][0]['graphql']
                ['user']['is_verified'])
        }

        # Tries to scrape posts if it is a public profile
        self.save_data()
        if downloadPhotos == True:
            self.scrape_posts()
        self.print_data()
Example #54
0
class Amazon:
  def __init__(self, url):
    # self.url = "https://www.amazon.in/Bose-SoundLink-Wireless-Around-Ear-Headphones/dp/B0117RGG8E/ref=sr_1_11?qid=1562395272&refinements=p_89%3ABose&s=electronics&sr=1-11"
    self.url = url
    self.old_price = 99999999
    self.count = 0
    self.product_details = ""

  def request_with_ua(self):
    self.error = ""
    lines = open("user_agents.txt").read().splitlines()
    user_agent =  random.choice(lines)

    self.headers = { 'User-Agent' : user_agent } 
    print("New Amazon request with : ", user_agent)
    
    response = requests.get(self.url)
    # response = requests.get(self.url, headers=self.headers)

    self.soup = BeautifulSoup(response.content, 'html.parser')
    self.soup.encode('utf-8')

    try:
      #check whether browser version is supported or not
      self.error = self.soup.find("div", {"class": "popup-header"}).text.strip()
      print("Browser is no longer supported")

    except Exception as bb:
      try:
        # when browser is supported then check captcha page
        self.error = self.soup.find("div", {"class": "a-box a-alert a-alert-info a-spacing-base"}).text.strip()
        print("Browser Captcha error")
        print("Self.error:",self.error)
        # print('Error on line {}'.format(sys.exc_info()[-1].tb_lineno), type(bb).__name__, bb)
      except Exception as bc:
        print("Browser is supported")
        # print('Error on line {}'.format(sys.exc_info()[-1].tb_lineno), type(bc).__name__, bc)
        pass
      # print(be)
    
    f = open("soup.html", "w",encoding= "utf-8")
    f.write(str(self.soup))
    f.close()

  def check_price(self):
    try:

      self.request_with_ua()
      while(self.error=="Your browser is no longer supported" or self.error.startswith("Enter the characters you see below")):
        print("User agent switching\n")
        self.request_with_ua()

      try:
        # print("current price block")
        self.title = self.soup.find(id= "productTitle").get_text().strip()
        self.current_price = self.soup.find(id = "priceblock_ourprice").get_text().replace(',', '').replace('₹', '').replace(' ', '').strip()
      except:
        print("Its a special deal price")
        self.current_price = self.soup.find(id = "priceblock_dealprice").get_text().replace(',', '').replace('₹', '').replace(' ', '').strip()

      try:  
        self.current_price = int(self.current_price.split(".")[0])
        self.review_count = self.soup.find(id="acrCustomerReviewText").get_text().split()[0]
        self.stars = self.soup.find(id = "acrPopover").get_text().strip()

        try:
          self.product_dict = {'Product Name': self.title, 'price': self.current_price, 'stars':self.stars, 'Number of reviews': self.review_count}
        except NameError as e:
          self.product_dict = {'Product Name': self.title, 'price': self.current_price, 'stars':"Unable to fetch", 'Number of reviews': "Unable to fetch"}

        for key,value in self.product_dict.items():
          self.product_details = self.product_details + str(key) + " : "+ str(value) + "\n"
        # print(json.dumps(jsonObject, indent=2))

        if(self.current_price < self.old_price):
          self.old_price = self.current_price
          if self.count == 0:
            self.count = 1  # mark that the first price report has been returned
            return self.product_details
          return False
        else:
          return self.product_details

      except Exception as pqpq:
        print("second end block")
        print('Error on line {}'.format(sys.exc_info()[-1].tb_lineno), type(pqpq).__name__, pqpq)

    except Exception as qq:
        print("end block")
        print('Error on line {}'.format(sys.exc_info()[-1].tb_lineno), type(qq).__name__, qq)


  # method that sends an email if the price falls
  def send_mail(self):
    server = smtplib.SMTP('smtp.gmail.com', 587)
    server.ehlo()
    server.starttls()
    server.ehlo()

    server.login('*****@*****.**', 'password')

    subject = 'Price Fell Down'
    body = "Check the amazon link https://www.amazon.in/Bose-SoundLink-Wireless-Around-Ear-Headphones/dp/B0117RGG8E/ref=sr_1_11?qid=1562395272&refinements=p_89%3ABose&s=electronics&sr=1-11 "

    msg = f"Subject: {subject}\n\n{body}"

    server.sendmail(
      '*****@*****.**',
      '*****@*****.**',
      msg
    )
    # print a message to check if the email has been sent
    print('Hey Email has been sent')
    # quit the server
    server.quit()
Example #55
0
def scrape():
    # RANDOM TIMER TO MAKE ANY LOOPING CALLS TO A URL APPEAR MORE "HUMAN"
    rLow = int(g['LOOP_RNDM_SLEEP_LOW'])
    rHigh = int(g['LOOP_RNDM_SLEEP_HIGH'])
    rndm_sleep = random.randint(rLow, rHigh)
    # CALCULATE RETENTION DATE FROM RETENTION DAYS VARIABLE IN VARS TABLE
    retention_date = datetime.date.today() + datetime.timedelta(
        -int(g['DATA_RETENTION_DAYS']))
    retention_date_id = retention_date.strftime('%Y%m%d')
    # =============================================================================
    # DELETE FROM LOCAL DB WHERE A RERUN WOULD REPRODUCE "DUPLICATE" DATA
    # =============================================================================
    dbmgr = pyDB(g['DB'])
    q = r"""DELETE FROM {0} WHERE (msmt_dte_id = {1} or msmt_dte_id <= {2}) and cntry_cde = '{3}' and site_cde = '{4}'""".format(
        g['TBL_NME'],  #[0]
        g['MSMT_DTE_ID'],  #[1]
        retention_date_id,  #[2]
        g['CNTRY_CDE'],  #[3]
        g['SITE_CDE']  #[4]
    )
    dbmgr.query(q)
    # =============================================================================
    # PASS URL TO RETURN HTML FROM SITE PAGE
    # CAPTURE ANY ERRORS EXPERIENCED AND PASS TO LOCAL DB
    # =============================================================================
    url = g['URL'] + g['URL_PART1']
    passedHTML = pyHTMLPass.htmlPass(url, **g)
    soup = BeautifulSoup(passedHTML, "html.parser")
    # ==========================================================================================================================================================
    # SCRAPE PART - START
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE DB statements
    # ==========================================================================================================================================================
    # PASS 1 - TOTAL COUNT ========================================================================
    facet_type = 'TOTAL'
    facet_desc = 'ALL JOBS'
    nbr = re.search('<title>(.*?)</title>', str(soup.encode("utf-8"))).group(1)
    nbr = str(nbr).replace(',', '')
    nbr = re.findall('\d+', nbr)
    facet_count = nbr[0]
    facet_count = int(facet_count)
    # =============================================================================
    # WRITE RESULTS OF SOUP ANALYISIS/SCRAPE TO LOCAL DB
    # =============================================================================
    dbmgr = pyDB(g['DB'])
    q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT) VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
        g['TBL_NME'],  #[0]
        g['MSMT_DTE_ID'],  #[1]
        g['DATA_TYPE'],  #[2]
        g['CNTRY_CDE'],  #[3]
        g['SITE_CDE'],  #[4]
        facet_type,  #[5]
        facet_desc,  #[6]
        facet_count,  #[7]
        g['STARTED_AT'],  #[8]
        ''  #[9]
    )
    dbmgr.query(q)
    # PASS 2 - INDUSTRY COUNT =====================================================================
    for ul in soup.find_all('ul', class_='facet'):
        for li in ul.find_all('li'):
            # return the facet text (section title)
            # assumes the first row of the facet is the "title" row - breaks if it isn't
            facet = li.find('strong')
            if facet:
                facet_type = facet.text.upper()
            else:
                # if no title row is found, carry the current facet_type value forward
                facet_type = facet_type.upper()

            facet_desc = li.find('a')

            if facet_desc:  # checks if there is a result on the search for the "a" anchor (removes the title of the sections by default - returned above)
                facet_desc = facet_desc.text.upper()
                facet_desc = re.sub(
                    r"[!@#$']", '',
                    str(facet_desc))  # removes special characters from string
                facet_count = li.find('span')
                facet_count = int(facet_count.text.replace(',', ''))
                # =============================================================================
                # WRITE RESULTS OF SOUP ANALYISIS/SCRAPE TO LOCAL DB
                # =============================================================================
                dbmgr = pyDB(g['DB'])
                q = r"""INSERT INTO {0} (MSMT_DTE_ID, DATA_TYPE, CNTRY_CDE, SITE_CDE, FACET_TYPE, FACET_DESC, FACET_CNT, STARTED_AT, FINISHED_AT) VALUES ({1}, '{2}', '{3}', '{4}', '{5}', '{6}', '{7}', '{8}', '{9}')""".format(
                    g['TBL_NME'],  #[0]
                    g['MSMT_DTE_ID'],  #[1]
                    g['DATA_TYPE'],  #[2]
                    g['CNTRY_CDE'],  #[3]
                    g['SITE_CDE'],  #[4]
                    facet_type,  #[5]
                    facet_desc,  #[6]
                    facet_count,  #[7]
                    g['STARTED_AT'],  #[8]
                    ''  #[9]
                )
                dbmgr.query(q)
            else:  # if no "a" anchor is found, ignore
                None
    # =============================================================================
    # WRITE HTML PAGE TO FILE
    # =============================================================================
    if g['WRITE_HTML_TO_FILE'] == 'Y':
        file_name = g['MSMT_DTE_ID'] + '_' + g['CNTRY_CDE'] + '_' + g[
            'SITE_CDE'] + '_' + 'SITE_LISTING' + '.html'
        with open(g['CONFIG']['DB_DIR'] + '__html\\' + file_name,
                  'w+',
                  encoding='utf-8') as f:
            f.writelines(str(soup))
        f.close()
    # ==========================================================================================================================================================
    # SCRAPE PART - END
    # - this should be the primary section of code that changes
    # - only other sections that "may" change are DELETE and UPDATE db statements
    # ==========================================================================================================================================================
    # =============================================================================
    # UPDATE LOCAL DB WITH A FINISH TIME
    # =============================================================================
    finished_at = time.strftime(
        "%Y-%m-%d %H:%M:%S")  # capture a finish time to be entered into the db
    dbmgr = pyDB(g['DB'])
    q = r"""UPDATE {0} SET finished_at = '{1}' WHERE cntry_cde = '{2}' and msmt_dte_id = {3}""".format(
        g['TBL_NME'],  #[0]
        finished_at,  #[1]
        g['CNTRY_CDE'],  #[2]
        g['MSMT_DTE_ID']  #[3]
    )
    dbmgr.query(q)
Example #56
0
url = "https://www.soccerstats.com/homeaway.asp?league=denmark"
time.sleep(2)  # brief pause between requests
data = requests.get(url)
soup = BeautifulSoup(data.content, "html.parser")
div = soup.find("div", id="h2h-team2")
table = div.find("table", id="btable")
f = open("denmark1/awayRows.txt", "w")
f.write(str(table))
f.close()
print("Away Rows Complete")
counter += 1
print(str(counter) + " out of " + str(total))

url = "https://www.soccerstats.com/results.asp?league=denmark&pmtype=bygameweek"
time.sleep(2)  # brief pause between requests
data = requests.get(url)
soup = BeautifulSoup(data.content, "html.parser")
soup.encode(formatter=None)
div = soup.find("div", class_="tabbertabdefault")
table = div.find("table", id="btable")
f = open("denmark1/fixtures.txt", "w")
f.write(str(table))
f.close()
print("Fixtures Complete")
counter += 1
print(str(counter) + " out of " + str(total))

url = "https://www.soccerstats.com/results.asp?league=denmark&pmtype=bydate"
time.sleep(2)  # brief pause between requests
data = requests.get(url)
soup = BeautifulSoup(data.content, "html.parser")
div = soup.find_all("table", id="btable")
f = open("denmark1/seasonFixtures.txt", "w")
f.write(str(div[0]))
Example #57
0
#result = requests.get("https://www.amazon.com/HP-EliteDesk-800-G1-Refurbished/dp/B0784F3NHF", headers=headers)
result = requests.get("https://www.amazon.com/HP-EliteDesk-800-G1-Refurbished/dp/B0784F82Q5", headers=headers)

#you can get the status code for the page
#print(result.status_code)

#print(result.headers)


src = result.content

#lxml needs to be installed separately
soup = BeautifulSoup(src, 'lxml')

#Change from soup object to byte object
textable = soup.encode('utf-8')

#Change from byte object to string object
encoding = 'utf-8'
transformToString = textable.decode(encoding)
#Print object type
print(type(transformToString))



#-------this works for In Stock
#foundStringIndex = transformToString.find("In Stock")
#print(foundStringIndex)
#shownString = transformToString[foundStringIndex : foundStringIndex+8]
#print(shownString)
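# A hedged alternative to the string search sketched above (the "availability"
# element id is an assumption about Amazon's markup, not verified here): query
# the parsed soup directly instead of scanning the flattened string.
availability = soup.find(id="availability")
if availability is not None:
    status_text = availability.get_text(strip=True)
    print("In Stock" in status_text)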
Example #58
0
def consolidate_report():
    pass_count = 0
    fail_count = 0
    with open(test_results_file, 'r') as html_file:
        for line in html_file:
            if 'PASS' in line:
                if '<b>' in line:
                    pass_count += 1
            if 'FAIL' in line:
                if '<b>' in line:
                    fail_count += 1
    total_count = pass_count + fail_count

    consolidate_table = """
        <table border="2">
            <col width="150">
            <col width="150">
                <col width="150">
                <tr bgcolor="#b3ffff">
                        <th colspan="4" style="font-size:19px">Consolidated Report</th>
                </tr>
                <tr>
                        <th style="font-size:17px">Total</th>
                        <th style="font-size:17px">Passed</th>
                        <th style="font-size:17px">Failed</th>
                </tr>
                <tr align="center"> <td style="font-size:17px">{0}</td>
                     <td><font color=green style="font-size:17px"><b>{1}</b></td>
                     <td><font color=red style="font-size:17px"><b>{2}</b></td>
                </tr>
            </table>
        <br>
            """.format(total_count, pass_count, fail_count)
    with open(test_results_file, 'r') as f2:
        ogcontent = f2.read()
    with open(test_results_file, 'w') as f3:
        f3.write(consolidate_table)
    styl = '''
    <style>
         pre {
            overflow-x: auto;
            white-space: pre-wrap;
            white-space: -moz-pre-wrap;
            white-space: -pre-wrap;
            white-space: -o-pre-wrap;
            word-wrap: break-word;
         }
      </style>
    '''
    with open(test_results_file, 'a') as f4:
        f4.write(styl)
        f4.write(ogcontent)

    from bs4 import BeautifulSoup
    with open(test_results_file, 'r') as f:
        soup = BeautifulSoup(f, 'html.parser')
    l1 = soup.findAll('table', {'border': '1'})
    for each in l1:
        i = 1
        children = each.findChildren('b')
        for child in children:
            if child.string != 'FAIL' and child.string != 'PASS':
                child.string = "{}. ".format(i) + child.string
                i += 1
    with open(test_results_file, "wb") as f_output:
        f_output.write(soup.encode('utf8'))
Example #59
0
import os
import sys

reload(sys)
sys.setdefaultencoding('utf8')

from selenium import webdriver
from bs4 import BeautifulSoup
import time

browser = webdriver.Chrome()
url = 'http://loudong.360.cn/Loo/index/search/%E4%BA%91/p/{page}.html'
dirpath = os.getcwd()
filepath = os.path.join(dirpath, 'butian_{page}.html')

for i in range(1, 73):
    browser.get(url.format(page=i))
    soup = BeautifulSoup(browser.page_source, 'lxml')
    print soup.encode("utf8")

    with open(filepath.format(page=i), 'w') as f:
        f.write(browser.page_source.encode("utf8"))
    f.close()
    #browser.delete_all_cookies()
    time.sleep(2)

browser.close()

respath = os.path.join(dirpath, 'res_butian')
for i in range(1, 73):
    soup = BeautifulSoup(open(filepath.format(page=i)), 'lxml')

    f = open(respath, 'a')
Example #60
0
start_time = time.time()

startpoint = 1
for full in fulls:
    site = full[2]
    org = full[0]
    smo_id = full[3]
    term = full[1]
    year = full[4]
    driver = webdriver.Chrome("CHROMEDRIVER DIRECTORY")
    driver.get(site)  #get original site info
    source = driver.page_source
    #text = requests.get(site).text #text version of getting requests
    #soup = BeautifulSoup(text) #use text version to get site soup
    soup = BeautifulSoup(source, "html.parser")
    soup2 = soup.encode("utf-8")
    try:
        #resultno = re.findall('English</a><span class="resultscount"> (.*?)\xe2\x80\x8e</span>',soup2)
        #resultno = re.findall('<h1 id="pqResultsCount">\n(.*?) results\n</h1>',soup2)
        resultno = re.findall('<h1 id="pqResultsCount">\n(.*?) result', soup2)
        resultno = ''.join(resultno)
        resultno = resultno.translate(None, "(){}<>,")
        resultno = int(resultno)
    except ValueError, e:
        resultno = int(0)
    no_pages = int(math.ceil(resultno / 20))
    #encrypt = re.findall('href="https://search.proquest.com/docview/(.*?)/',soup2)
    an = re.findall('{"(.*?)markedlistcheckbox:markallitems', soup2)
    an = ''.join(an)
    an = re.findall('markAll":false,"formats":{(.*?)},"markURL"', an)
    an = ''.join(an)