Example #1
 def write(self, s):
     self.set_current_page(0)
     if "\(" in s or "\[" in s or "$$" in s:
         f = open("/home/fuzzy/Projects/helloworld/html/template.html")
         bs = bsoup(f.read(), XML_PARSER)
         f.close()
         bs("div")[0].append(bsoup(s, XML_PARSER))
         f = open("/home/fuzzy/Projects/helloworld/temp/output.html",'w')
         f.write(str(bs))
         f.close()
         self.current_view().open("http://localhost/helloworld/temp/output.html")
     else:
         self.current_view().get_dom_document().get_element_by_id("body").set_inner_html(s)
Example #2
def login():
	global m_session
	
	username = raw_input("Mixi Username: ")
	password = raw_input("Mixi Password: ")
	# The actual login request was scrubbed in the source; the session setup,
	# LOGIN_URL and the form field names below are placeholders, not the
	# original values.
	m_session = requests.Session()
	res = m_session.post(LOGIN_URL, data={"email": username, "password": password})
	if "check.pl" in res.text:
		print "Login Failed. Abort."
		sys.exit(1)
	print "Login Complete"
Example #3
def fetch_saved_stories(uname, session, max_page=-1):
    all_stories = {}
    blank_url = 'https://news.ycombinator.com/saved?id={0:s}&p={1:d}'
    page = 1
    while True:
        url = blank_url.format(uname, page)
        response = session.get(url)
        print("fetching from page {0:d}: {1:s}.".format(page, url))
        assert response.status_code == 200, \
                "Unexpected status code: {0:d}".format(response.status_code)
        print("success.")
        soup = bsoup(response.content.decode('utf-8', 'ignore'), 'html.parser')
        stories = soup.find_all("td", class_ = "title")
        has_next_page = has_more(stories)
        if has_next_page:
            stories.pop()
        ranks = stories[0:-1:2]
        entries = stories[1:-1:2]
        entries.append(stories[-1])
        for idx,entry in enumerate(entries):
            rank = get_rank(str(ranks[idx]))
            title = get_title(str(entry))
            link = get_link(str(entry))
            all_stories[rank] = [title, link]
        if max_page > -1 and page == max_page:
            print("finish fetching {0:d} pages.".format(page))
            break
        if not has_next_page:
            print("finish fetching {0:d} pages.".format(page))
            break
        page = page + 1
        time.sleep(5)
    session.close()
    return all_stories
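A minimal usage sketch for fetch_saved_stories above; the helpers it relies on (has_more, get_rank, get_title, get_link) are assumed to live in the same module, and the session is assumed to carry a valid Hacker News login cookie, since the saved-stories page is typically only visible to its owner.
import requests

session = requests.Session()
# session.cookies.set('user', '<auth cookie>')  # hypothetical: set the HN auth cookie here
stories = fetch_saved_stories('some_user', session, max_page=2)
for rank, (title, link) in sorted(stories.items()):
    print(rank, title, link)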
Example #4
def cars_call(zip, page_num):
    ##this is the original page
    #page = requests.get('https://www.cars.com/for-sale/searchresults.action/?page='+str(page_num)+'&perPage=100&rd=20&searchSource=PAGINATION&sort=relevance&zc='+str(zip))

    # This has all the cars sorted by their distance from a given zip code
    page = requests.get(
        'https://www.cars.com/for-sale/searchresults.action/?page=' +
        str(page_num) +
        '&perPage=100&rd=99999&searchSource=GN_BREADCRUMB&sort=distance-nearest&zc='
        + str(zip))

    #convert to a beautiful soup object:
    soup = bsoup(page.content, features="lxml")
    #show contents
    soup.prettify()

    # auto tempest isn't working.. did they somehow make everything private? D:
    #start scraping find and find_all
    body = soup.find_all('script')

    #body = soup.find('class')
    #body = soup.find('div' )
    # yes or no answer if it is certified pre-owned
    #spec = body.find_all('span')
    return str(body)
Example #5
def relstock():
    r = requests.get(
        "https://in.finance.yahoo.com/quote/RELIANCE.NS?p=RELIANCE.NS")
    soup = bsoup(r.text, "lxml")
    content = soup.find(
        'div', class_="My(6px) Pos(r) smartphone_Mt(6px)").find('span').text
    return content
Example #6
def get_response(url, site_tag):
    aggr_urls = []
    agent = {
        "User-Agent":
        'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
    }
    site_req_url = requests.get(url, headers=agent)
    site_req_html = bsoup(site_req_url.content, 'html.parser')

    if site_tag == "N7P":
        links = site_req_html.find_all('a', href=True, text=True, rel=True)
        for lk in links:
            for i in lk['rel']:
                if i == 'bookmark':
                    aggr_urls.append(lk.string)
                    aggr_urls.append(lk['href'])

    if site_tag == "W10":
        for i in site_req_html.select('article[class*="mspuloop"]'):
            lk = i.find_all('a', href=True, title=True)[0]
            aggr_urls.append(lk['title'])
            aggr_urls.append(lk['href'])

    aggr_urls.reverse()
    return (aggr_urls)
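Because get_response appends each entry's title followed by its link and then reverses the flat list, callers can pair the items back up two at a time; a small sketch (the feed URL here is a placeholder):
entries = get_response('https://example.com/feed', 'N7P')
# after the reverse() above, items alternate href, title
for href, title in zip(entries[0::2], entries[1::2]):
    print('{} -> {}'.format(title, href))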
Example #7
	def get_links(self):
		"""Assigns all vid links to self.links attribute and returns list of links"""
		r = requests.get("%s?page=last" % (self.base)).text
		soup = bsoup(r)
		lks = soup.findAll('a')
		self.links = lks
		return self.links 
Example #8
def getCaptcha(URL,ref):
	#print "\tcaptcha@{%s}" % (URL)
	path = re.split("^http://(?:[\w\-_0-9].*?)\.(?:[\w\-_0-9].*?)\.(?:[\w\-_0-9]+)",URL)[1]
	#''.join(re.split('^/sorry/image\?id=',path)).rstrip('&hl=en')
	image_id=1
	host = URL[:len(URL)-len(path)]
	#print "[%s][%s]" % (host,path)
	#print host.lstrip('http://')
	conn=httplib.HTTPConnection(ref)
	conn.putrequest('GET',path)
	conn.putheader('Referer',ref)
	conn.putheader('Accept','text/html')
	conn.putheader('User-agent','Internet Explorer 6.0')
	conn.endheaders()
	resp=conn.getresponse()
	html=resp.read()
	#print html
	#strip image files
	soup=bsoup(html)
	#form=soup.find('form',{'method':'get'})
	#print "---------------"
	#print form
	#print "---------------"
	imgs=soup.find('img')
	#print "---------------"
	print imgs
	#print "---------------"
	return
Example #9
def get_indicators (id_=None):
	tIndicator_dict = dict()
	r = requests.get('{}/{}'.format(BASE_URL, 'indicator'))
	soup = bsoup(r.text, 'lxml')
	topics = soup.findAll('h3', id=re.compile('topic-*'))
	main_div = soup.find('div', {'class': 'content-wrapper clear-block'})
	indicators = main_div.findAll('table', {'class': 'views-view-grid grid-2'})
	## print(len(topics), len(indicators))
	for i in range(len(indicators)):
		indList = []
		tIndicators = indicators[i].findAll('a')
		for ind in tIndicators:
			indList.append(ind.text)
		tIndicator_dict[topics[i].text] = indList
	if type(id_) == type(1):
		try:
			tid_list = list(tIndicator_dict.items())
			return {tid_list[id_-1][0]: tid_list[id_-1][1]}
		except:
			return None
	
	if type(id_) == type('string'):
		try:
			return tIndicator_dict[id_]
		except:
			return None
	
	return tIndicator_dict
Example #10
def parse_data(page):
    soup = bsoup(page, 'html.parser')
    relationdict = {}

    # This following code will give us the Synonyms, Antonyms and Related
    # words.
    wndata = soup.find('section', attrs={'data-src': 'wn'})
    postype = ''
    for row in wndata('tr'):
        for i, col in enumerate(row('td')):
            data = col(text=True)
            if i == 0:
                temptype = data
            if i == 1 and data == [u'1.']:
                postype = temptype[0]
            if postype not in relationdict:
                relationdict[postype] = {}
                relationdict[postype]['Syn'] = []
                relationdict[postype]['Ant'] = []
                relationdict[postype]['Rel'] = []
            relationdict[postype]['Syn'] += \
                _extract_rel_words(col('div',  attrs={'class': 'Syn'}))
            relationdict[postype]['Ant'] += \
                _extract_rel_words(col('div',  attrs={'class': 'Ant'}))
            relationdict[postype]['Rel'] += \
                _extract_rel_words(col('div',  attrs={'class': 'Rel'}))

    wndata = soup.find('section', attrs={'data-src': 'hm'})
    _extract_pseg_lemma(wndata('div', attrs={'class': 'pseg'}))
    _extract_pseg_lemma(wndata('div', attrs={'class': 'runseg'}))

    return relationdict
Example #11
 def readXmlData(self, fnXML, isCleanEmptyGroups=True, isDebug=False):
     with open(fnXML,'r') as f:
         # (1) read data
         dataXML=bsoup(f.read(),'lxml-xml')
         if isDebug:
             print(dataXML.prettify())
         dictGroup={}
         # (2) parse groups
         for ii in dataXML.findAll('Group'):
             tmp=dict(ii.attrs)
             dictGroup[tmp['Name']]=[]
         if isDebug:
             print dictGroup
         # (3) iterate coords
         for i,ii in enumerate(dataXML.findAll('Annotation')):
             tmp=dict(ii.attrs)
             tIdGroup=tmp['PartOfGroup']
             lstCoords=[]
             for j,jj in enumerate(ii.findAll('Coordinate')):
                 tx=np.float(jj.attrs['X'])
                 ty=np.float(jj.attrs['Y'])
                 tp=np.int(jj.attrs['Order'])
                 lstCoords.append((tp,tx,ty))
                 if isDebug and ((j%2000)==0):
                     print tIdGroup, '(',i,')[',j,'] : {', tp,tx,ty,'}'
             arrCoords=np.array(lstCoords)
             sidx=np.argsort(arrCoords[:,0])
             arrCoords=arrCoords[sidx,1:]
             arrCoords=np.append(arrCoords,[arrCoords[0]],axis=0)
             dictGroup[tIdGroup].append(arrCoords)
         # (4) Clean empty Groups:
         if isCleanEmptyGroups:
             tlstKeyEmpty=[kk for kk,vv in dictGroup.items() if len(vv)<1]
             if isDebug:
                 print 'Empty keys: ', tlstKeyEmpty
             for kk in tlstKeyEmpty:
                 del dictGroup[kk]
         # (5) Create plain MplPaths:
         self.lstGroupId=[]
         self.lstPaths=[]
         for kk,gg in enumerate(dictGroup.values()):
             for ii in gg:
                 self.lstGroupId.append(kk)
                 self.lstPaths.append(mplPath.Path(ii))
             print kk, ' : ', len(gg)
         #
         self.lstSubPaths=[]
         for ii in self.lstPaths:
             tmp=[]
             for j0,jj in enumerate(self.lstPaths):
                 if ii!=jj:
                     if ii.contains_path(jj):
                         tmp.append(j0)
             self.lstSubPaths.append(tmp)
         #
         print '------------------'
         for i,ii in enumerate(self.lstSubPaths):
             print i, ' : ', len(ii), ' : ', dictGroup.keys()[self.lstGroupId[i]]
         #
         self.dataXML = dictGroup
Example #12
    def _handle_doc(self, html_doc):
        soup = bsoup(html_doc, "lxml")

        for ele in soup.find(id = "plist").ul.children:
            eleClass = ele['class'][0]
            # JD product item
            if eleClass == "gl-item":
                aDom = ele.a
                reobj = re.compile(r'href="(.*?)".*ata-lazy-img="(.*?)"', re.IGNORECASE)
                result = reobj.findall(str(aDom))
                # product detail page URL
                skuUrl = result[0][0] 
                # product image
                skuPic = result[0][1] 
                if skuUrl is not None:
                    reid = re.compile(r'/(\d+).html', re.IGNORECASE)
                    result = reid.findall(str(skuUrl))
                    # print result;
                    skuId = result[0]
                else:
                    skuId = ''

                # product price
                skuPrice = self.get_price(skuId)
                
                skuName = ele.find_all("div", class_="p-name")
                skuName = skuName[0]
                reName = re.compile(r'<em>(.*?)</em>(.*)')
                nameResult = reName.findall(str(skuName))
                skuName = nameResult[0][0]

                self._insert_database(skuId, skuUrl, skuName, skuPrice, skuPic)
Example #13
def parse_tdt_by_topic(src_dir, doc_type, limit=0, lang=None):
    """
    Iterate over the complete list of topics from the given file and parse as a dictionary of 
    Pythia clusters, mapped to an array of relevant docs.
    """
    logger.info("parse_tdt_topics(%s, %s)", src_dir, doc_type)
    topic_file = os.path.join(src_dir, REL_TOPIC_DIR, TOPIC_REL_FILE)
    clusters = dict()
    count = 0
    with open(topic_file) as fin:
        for line in fin:
            count += 1
            if (limit > 0) and (count >= limit):
                logger.info("Limit of %s documents reached.", limit)
                break
            ontopic = bsoup(line, "lxml").ontopic
            logger.debug(ontopic)
            if ontopic is not None:
                tdt_level = ontopic["level"]
                # Not considering stories with only BRIEF topic references
                if "BRIEF" == tdt_level:
                    continue
                post_id = ontopic["docno"]
                tdt_topicid = ontopic["topicid"]
                tdt_fileid = ontopic["fileid"]
                doc_date = tdt_fileid.split("_")[0]
                doc_src = "_".join(tdt_fileid.split("_")[-2:])
                # If a language was specified, limit to sources in the given language
                if lang is not None:
                    if "ENG" == lang and (doc_src not in ENGLISH_SRCS):
                        logger.debug("Skipping non-English source document.")
                        continue
                    if "MAN" == lang and (doc_src not in MANDARIN_SRCS):
                        logger.debug("Skipping non-Mandarin source document.")
                        continue
                cluster_id = "{topic}_{date}".format(topic=tdt_topicid, date=doc_date)
                cluster = clusters.get(cluster_id, dict())
                post = cluster.get(post_id, dict({"post_id": post_id}))
                post["tdt_level"] = tdt_level
                post["novelty"] = False
                if len(cluster) == 0:
                    post["novelty"] = True
                #                if 'BRIEF' == tdt_level:
                #                    post['novelty'] = False
                # FIXME - determine a realistic way to assign novelty
                #                post['novelty'] = 'TBD'
                post["cluster_id"] = cluster_id
                post["tdt_topicid"] = tdt_topicid
                # TODO: look at alternatives for setting order, e.g. timestamp
                post["order"] = len(cluster)
                post["tdt_fileid"] = tdt_fileid
                post["tdt_src"] = doc_src
                # TODO - get text from source file and add as 'body_text'
                post["body_text"] = extract_doc_text(src_dir, tdt_fileid, post_id)
                cluster[post_id] = post
                clusters[cluster_id] = cluster
    #                 logger.debug("Doc:")
    #                 logger.debug(cluster[post_id])
    return clusters
Example #14
def parse_espn(name):
    url = "http://espn.go.com/nba/team/schedule/_/name/" + name
    r = rq.get(url)
    soup = bsoup(r.content, 'lxml')

    ## Find all the rows that have classes. Remove the first one -- it's irrelevant.
    trs = soup.find_all("tr", class_=True)[1:]
    ## Main procedure.

    with open("nba_2016_csv/" + name + "_2016_schedule.csv", "wb") as ofile:
        f = csv.writer(ofile)
        ## Write the headers.
        f.writerow(["team", "date", "a/h", "opponent", "w/l", "s", "os"])
        for tr in trs:
            team = name
            tds = tr.find_all("td")
            date = tds[0].get_text().encode("utf-8")
            opp = tds[1].find_all("li", {"class": "team-name"})
            for teams in opp:
                other_team = teams.get_text()

            opponent = tds[1].find_all('li', {'class': 'game-status'})
            for o in opponent:
                h_a = o.get_text()
                try:
                    win_loss = tds[2].find_all(
                        'li', {'class': 'game-status win'}) or tds[2].find_all(
                            'li', {'class': 'game-status loss'})
                    for a in win_loss:
                        w_l = a.get_text()
                    score = tds[2].find_all('a')
                    for s in score:
                        if w_l == 'W':
                            gs = s.get_text().split("-")[0]
                            gs = int(gs)
                            ogs = s.get_text().split("-")[1].split(" ")[0]
                            ogs = int(ogs)
                        elif w_l == 'L':
                            gs = s.get_text().split("-")[1].split(" ")[0]
                            gs = int(gs)
                            ogs = s.get_text().split("-")[0]
                            ogs = int(ogs)
                except:
                    h_a = ''
                    w_l = None
                    other_team = ''
                    gs = None
                    ogs = None
            ## write the result to the CSV file.
                finally:
                    f.writerow([team, date, h_a, other_team, w_l, gs, ogs])
                    g = dict(date=date,
                             team=team,
                             h_a=h_a,
                             other_team=other_team,
                             w_l=w_l,
                             s=gs,
                             os=ogs)
                    connect().save(g)
Example #15
def get_euribor(dias, valores):
    # Current date
    now = datetime.datetime.now()

    anyo = int(now.strftime("%Y"))
    # Set the final year to 2006, which is how far back the price data goes
    anyo_final = 2006
    mes = int(now.strftime("%m"))

    controlmesactual = True

    # For each year
    for i_anyo in range(anyo, anyo_final-1, -1):
        # For each month
        for i_mes in range(mes, 0, -1):
            if controlmesactual:
                url = "https://www.idealista.com/news/euribor/mensual/mes-actual/"
                controlmesactual = False
            else:
                url = "https://www.idealista.com/news/euribor/mensual/%s-%d/" % (
                calendar.month_name[int(i_mes)], i_anyo)

            # Pick a user agent at random
            userAgent = random.choice(userAgents)

            # Load the default headers
            headers = requests.utils.default_headers()

            # Update the headers with the random User-Agent
            headers.update({'User-Agent': userAgent})

            # Space out requests (2 or 3 seconds)
            sleep_secs = random.randrange(2, 4)
            time.sleep(sleep_secs)

            # Download the page of interest
            html = requests.get(url, headers=headers)

            soup = bsoup(html.content)

            contador = 0

            for dato in soup.body.tbody.find_all('td'):
                contador = contador + 1
                if contador % 2 == 1:
                    # date
                    fecha = "%d%s%s" % (i_anyo, '{:02d}'.format(i_mes), '{:02d}'.format(int(dato.string)))
                    # Append the date to the list
                    dias.append(fecha)
                    print(fecha)
                else:
                    # euribor
                    euribor = dato.string[:-1].replace(",", ".")
                    # Append the Euribor value to the list
                    valores.append(float(euribor))
                    print(float(euribor))
        mes = 12

    return euribor
Example #16
def parse_tdt_by_topic(src_dir, doc_type, limit = 0, lang = None):
    """
    Iterate over the complete list of topics from the given file and parse as a dictionary of 
    Pythia clusters, mapped to an array of relevant docs.
    """
    logger.info('parse_tdt_topics(%s, %s)', src_dir, doc_type)
    topic_file = os.path.join(src_dir, REL_TOPIC_DIR, TOPIC_REL_FILE)
    clusters = dict()
    count = 0
    with open(topic_file) as fin:
        for line in fin:
            count+=1
            if (limit > 0) and (count >= limit): 
                logger.info('Limit of %s documents reached.', limit)
                break
            ontopic = bsoup(line, 'lxml').ontopic
            logger.debug(ontopic)
            if ontopic is not None:
                tdt_level = ontopic['level']
                # Not considering stories with only BRIEF topic references
                if 'BRIEF' == tdt_level:
                    continue
                post_id = ontopic['docno']
                tdt_topicid = ontopic['topicid']
                tdt_fileid = ontopic['fileid']
                doc_date = tdt_fileid.split('_')[0]
                doc_src = "_".join(tdt_fileid.split('_')[-2:])
                # If a language was specified, limit to sources in the given language
                if lang is not None:
                    if 'ENG' == lang and (doc_src not in ENGLISH_SRCS):
                        logger.debug("Skipping non-English source document.")
                        continue
                    if 'MAN' == lang and (doc_src not in MANDARIN_SRCS):
                        logger.debug("Skipping non-Mandarin source document.")
                        continue
                cluster_id = '{topic}_{date}'.format(topic=tdt_topicid,date=doc_date)
                cluster = clusters.get(cluster_id, dict())
                post = cluster.get(post_id, dict({'post_id':post_id}))
                post['tdt_level'] = tdt_level
                post['novelty'] = False
                if len(cluster) == 0:
                    post['novelty'] = True
#                if 'BRIEF' == tdt_level: 
#                    post['novelty'] = False
                # FIXME - determine a realistic way to assign novelty
#                post['novelty'] = 'TBD'
                post['cluster_id'] = cluster_id
                post['tdt_topicid'] = tdt_topicid
                # TODO: look at alternatives for setting order, e.g. timestamp
                post['order'] = len(cluster)
                post['tdt_fileid'] = tdt_fileid
                post['tdt_src'] = doc_src
                # TODO - get text from source file and add as 'body_text'
                post['body_text'] = extract_doc_text(src_dir, tdt_fileid, post_id)
                cluster[post_id] = post
                clusters[cluster_id] = cluster
#                 logger.debug("Doc:")
#                 logger.debug(cluster[post_id])
    return clusters
Example #17
def filtersoup(html):
    data = []
    page = bsoup(html)
    for link in page.findAll('a'):
        data.append(link.get('href'))
    for link in page.findAll('form'):
        data.append(link.get('action'))
    return data
Example #18
	def num_acct_pages(self):
		"""This should return an integer that is the number of pages of vids in the account"""
		try:
			sp = bsoup(requests.get(self.base).text)
			num = sp.find('a',href='?page=last').text
			return int(num)
		except:
			return 2 # if there's only one page, there's no 'last' -- should only go through 1 (range(1,1))
Example #19
def scrap_preceeding(base_url):
    homepage_html_content = web.download(base_url)
    homepage_soup = bsoup(homepage_html_content)
    ul_content = homepage_soup.find_all('ul')
    a_content = bsoup(str(ul_content)).find_all('a')
    volume_page_links = []
    for raw_link in a_content:
        volume_page_links.append(join(base_url, raw_link.get('href'))+'/')


    os.chdir('/home/sorkhei/Desktop/LDA-Papers/JMLR/Preceedings/')

    for base_link in volume_page_links[32:]:
        folder_name = base_link.split('/')[-2]
        address = os.path.join(os.getcwd(), folder_name)
        if not os.path.exists(address):
            os.mkdir(folder_name)
        else:
            index = 1
            while os.path.exists(address):
                folder_name = base_link.split('/')[-2] + '-' + str(index)
                print folder_name
                address = os.path.join(os.getcwd(), folder_name)
                index += 1
            os.mkdir(folder_name)

        os.chdir(address)


        print '--------------'
        print 'downloading from ' + base_link
        volume_content_soup = bsoup(web.download(base_link)).find_all('div', {'id': 'content'})
        a_content = bsoup(str(volume_content_soup)).find_all('a')
        # print a_content
        pdf_links = [join(base_link, link.get('href')) for link in a_content if str(link.get('href')).endswith('pdf')]
        for download_link in pdf_links:
            if not download_link.endswith('supp.pdf'):
                try:
                    content = web.download(download_link)
                except:
                    print 'link : %s is obsolete' % download_link
                    continue
                f = open(download_link.split('/')[-1], 'wb')
                f.write(content)
                f.close()
        os.chdir('/home/sorkhei/Desktop/LDA-Papers/JMLR/Preceedings/')
Example #20
    def getStockPrice(self):
        '''
		Parses the webpage for the stock price 
		'''
        return bsoup(self.getPage(), "html.parser").find(
            'div', {
                'class': 'My(6px) Pos(r) smartphone_Mt(6px)'
            }).find('span').get_text()
Example #21
def exploit(url, payload):
    res = requests.post(url, data=payload, timeout=10)
    if res.status_code == 200:
        soup = bsoup(res.text, 'html.parser')
        print soup.p.string
        print "[+] Exploit Finished!"
    else:
        print "\n[!] Exploit Failed!"
Example #22
def cook_categories(content):
    psoup = bsoup(content, 'lxml')
    urls = psoup.find('div', {'id': 'filterJob'}).find_all('li')
    urls = {
        x.text: x.find('a')['href']
        for x in urls if '全部' not in x.text and '招聘会' not in x.text
    }
    return urls
Example #23
    def SCRAPE_URL(
        self,
        SOURCE_URL="",  # URL to scrape
        URL_PARAMS=None,  # Additional params to pass with GET request
        PARA_MINLEN=15  # Paragraph elements < PARA_MINLEN characters will be ignored
    ):
        '''[BRIEF] : Function requests the source of the specified URL and scrapes the title, headers and paragraph elements of the HTML page
           [NOTE]  : All paragraphs < PARA_MINLEN characters in length are ignored ( default = 15 )
        '''

        # [STEP 1]: RAW CONTENT PROCUREMENT
        rawContent = self.MAKE_REQUEST(SOURCE_URL, URL_PARAMS)

        # [STEP 2]: USING PARSER TO GET HEADERS AND PARAGRAPHS
        # FIXME: For now, bs4 is the only supported parser backend
        if (self.PARSER_BACKEND == "bs4"):
            soup = bsoup(rawContent.content, 'html.parser')

            # STORE HEADERS AT EACH LEVEL
            for iHeaderLvl in range(self.HEADER_MAXDEPTH):
                headerKey = 'h' + str(iHeaderLvl)

                for item in soup.find_all(headerKey):
                    try:
                        item.get_text()
                        self.HEADERS[headerKey] = item.get_text()
                    except:
                        print('Unable to extract text from bs4 HEADER object')
                        logging.error(
                            'Unable to extract text from bs4 HEADER object')

            # STORE PARAGRAPHS
            for iPara in soup.find_all('p'):
                if (iPara.get_text()):
                    if (len(iPara.get_text().strip()) >= PARA_MINLEN):
                        self.PARAGRAPHS.append(iPara.get_text().strip())

            # STORE IMAGE LINKS
            if (self.PARSE_IMAGES):
                soup = bsoup(rawContent.text, 'lxml')
                images = soup.find_all('img')
                self.IMAGE_LINKS = [link.get('src') for link in images]
                self.IMAGE_LINKS = [
                    valid_link for valid_link in self.IMAGE_LINKS if valid_link
                ]
        return
Example #24
def filterSoup(html):
    data_ = []
    page = bsoup(html)
    for link in page.findAll('a'):
        data_.append(link.get('href'))
    for link in page.findAll('form'):
        data_.append(link.get('action'))
    return data_
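A quick usage sketch for filterSoup, assuming requests is available and bsoup is bs4.BeautifulSoup; it prints every <a> href followed by every <form> action found on the page (missing attributes come back as None):
import requests

html = requests.get('https://example.com').text
for target in filterSoup(html):
    print(target)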
Example #25
 def translate(self, to, query):
     query = query.replace(" ", "+")
     url = "{}&sl=auto&tl={}&ie=UTF-8&prev=_m&q={}".format(
         self.host, to, query)
     r = requests.get(url)
     data = bsoup(r.content, 'lxml')
     result = data.findAll('div', {'class': 't0'})[0]
     return result.text
Example #26
    def recrawl_duplicate_page(self):
        # Re-crawl pages that were duplicated because of timeouts.

        asc_index = 0
        dup_key = 'icd_code'
        db = pymongo.MongoClient('mongodb://localhost:27017')['icd11']
        info = db['info']
        html = db['html']
        total_num = info.find().count()
        icd_code_dict = {}
        dup_idx_queue = queue.Queue(maxsize=0)
        for cursor in info.find({}, {dup_key: 1, 'foundation_id': 1}):
            if cursor[dup_key] == 'None':
                cursor[dup_key] += str(asc_index)
                asc_index += 1
            if cursor[dup_key] not in icd_code_dict.keys():
                icd_code_dict[cursor[dup_key]] = [cursor['foundation_id']]
            else:
                icd_code_dict[cursor[dup_key]].append(cursor['foundation_id'])
        for k in icd_code_dict:
            if len(icd_code_dict[k]) > 1:
                print('[INFO] %s with size %s, members: %s' %
                      (k, len(icd_code_dict[k]), ', '.join(icd_code_dict[k])))
                for e in icd_code_dict[k]:
                    dup_idx_queue.put(e)

        self.browser = webdriver.Safari()
        self.browser.implicitly_wait(5)
        self.browser.set_page_load_timeout(40)

        part_cnt = 1
        while dup_idx_queue.qsize() > 0:
            self.total_cnt += 1

            curr_url_idx = dup_idx_queue.get()
            try:
                part_cnt += 1
                curr_url = self.url_page_format + curr_url_idx

                self.browser.get(curr_url)
                time.sleep(5)
                page = self.browser.page_source

                page_sup = bsoup(page, features='html.parser')
                page_div = page_sup.find_all('div', {'id': 'firstright'})
                if len(page_div) > 0:
                    self.update_mongo_rawpage(curr_url_idx, str(page_div[0]),
                                              0, 0)
                else:
                    self.update_mongo_rawpage(curr_url_idx, page, 0, 0)
            except TimeoutException as time_exp:
                print('[ERROR] time out error with code : %s' % time_exp)
                dup_idx_queue.put(curr_url_idx)
            except Exception as e:
                dup_idx_queue.put(curr_url_idx)
                print('[ERROR] unexpected error occurs with code : %s' % e)
                traceback.print_exc()
        self.browser.close()
Example #27
    def generate_response(self) -> Tuple[Any, int]:
        """Generates a response for the user's query

        Returns:
            Tuple[Any, int]: A tuple in the format (response, # of elements)
                             For example, in the case of a "feeling lucky"
                             search, the response is a result URL, with no
                             encrypted elements to account for. Otherwise, the
                             response is a BeautifulSoup response body, with
                             N encrypted elements to track before key regen.

        """
        mobile = 'Android' in self.user_agent or 'iPhone' in self.user_agent

        content_filter = Filter(self.session['fernet_keys'],
                                mobile=mobile,
                                config=self.config)
        full_query = gen_query(self.query, self.request_params, self.config,
                               content_filter.near)
        get_body = g.user_request.send(query=full_query)

        # Produce cleanable html soup from response
        html_soup = bsoup(content_filter.reskin(get_body.text), 'html.parser')
        html_soup.insert(
            0,
            bsoup(TOR_BANNER, 'html.parser')
            if g.user_request.tor_valid else bsoup('', 'html.parser'))

        if self.feeling_lucky:
            return get_first_link(html_soup), 0
        else:
            formatted_results = content_filter.clean(html_soup)

            # Append user config to all search links, if available
            param_str = ''.join(
                '&{}={}'.format(k, v)
                for k, v in self.request_params.to_dict(flat=True).items()
                if self.config.is_safe_key(k))
            for link in formatted_results.find_all('a', href=True):
                if 'search?' not in link['href'] or link['href'].index(
                        'search?') > 1:
                    continue
                link['href'] += param_str

            return formatted_results, content_filter.elements
Example #28
    def VeriverBabus(self):
        if self.adresVer.text() == "":
            self.adresVer.setPlaceholderText("Meslek dalı yazmalısınız..")
        else:
            self.adres = self.adresVer.text()
            if len(self.adres.split()) > 1:
                self.adres = "-".join(self.adres.split())
            print("Aranan : ", self.adres)
            link = "https://www.kariyer.net/is-ilanlari/" + self.adres
            header = {
                "User-Agent":
                "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36"
            }
            ilan_listesi = []
            kaynak_isle = requests.get(link, headers=header).text
            kod = bsoup(kaynak_isle, "lxml")

            for a in kod.find_all("a"):
                try:
                    a["href"]
                except KeyError:
                    pass
                else:
                    if a["href"].startswith("/is-ilani/"):
                        # self.gelenIlanlar.append("İş ilanı bulundu: "+ "https://kariyer.net"+ a["href"])
                        print("https://www.kariyer.net" + a["href"])
                        ilan_listesi.append("https://www.kariyer.net" +
                                            a["href"])
            if ilan_listesi:
                for link in ilan_listesi:
                    self.gelenIlanlar.append(link)
                    self.gelenIlanlar.append("")
                    kod = bsoup(
                        requests.get(link, headers=header).text, "lxml")
                    for konu in kod.find_all(
                            "div", attrs={"class": "genel-nitelikler"}):
                        for i in konu.find_all("li"):
                            self.gelenIlanlar.append(i.text)
                    self.gelenIlanlar.append("")
                    self.gelenIlanlar.append(
                        "*******************************************************************************************************************************************************************************************\n\n"
                    )
                    self.gelenIlanlar.repaint()
            else:
                self.gelenIlanlar.append("Bilgi Bulunamadı :(")
Example #29
    def get_appstatus(self):
        appstatus = {}
        for program in self.programs:
            r = self.s.get(program["Link"])
            appstatuspage = bsoup(r.content, "lxml")
            status = appstatuspage.find("a", {"name": "STATUS$0"}).get_text()
            appstatus[program["Program"]] = status

        return appstatus
Example #30
def get_game_html(home, away, date):
	doc = requests.get(GAME_URL.format(
		year=date.year,
		month=str(date.month).zfill(2),
		day=str(date.day).zfill(2),
		away=away,
		home=home,
	))
	return bsoup(doc.text)
Example #31
    async def __get_ids(self, url: str) -> list:

        r = self.__ses.get(url=url, allow_redirects=True)
        logg.debug(f'Requesting IDs page resulted in code: {r.status_code}')
        if r.status_code != 200:
            await Util().raise_error(
                r.status_code,
                str(r.reason) + ' - ' + str(bsoup(r.text, 'lxml').get_text()))

        soup = bsoup(r.content, 'lxml')
        for s in soup.findAll('script'):
            s.decompose()
        table = soup.find('table', attrs={'rules': 'rows'})

        ids = []
        for tr in table.findAll('tr')[1:]:
            ids.append(tr.td.get_text(strip=True))
        return ids
Example #32
def get_song(index):
    song_list = []
    for x in index:
        url = get_url(x)
        html = get_html(url)
        soup = bsoup(html, 'html.parser')
        song = soup.select('.wrap_song_info')
        song_list += [x.text for x in song]
    return song_list
Example #33
 def _proc_request(self):
     try:
         req = requests.get(self.url, headers=self.header)
     except Exception as errmsg:
         print("Err: %s" % errmsg)
         return False
     else:
         soup = bsoup(req.text, 'html5lib')
         return soup
Example #34
def parse_docs():
    for file in filenames:
        
        if not file.startswith('la'):
            continue
        print(file)
        try:
            soup = bsoup(open(os.path.join(data_root,file)), 'html.parser')
        except UnicodeDecodeError:
            soup = bsoup(open(os.path.join(data_root,file), encoding='ISO-8859-15'), 'html.parser')
        docs = soup.find_all('doc')
        for doc in docs:
            # op_dict = {
            #     "index": {
            #         "_index": "trec", 
            #         "_type": TYPE, 
            #         "_id": doc.docno.get_text().strip()
            #     }
            # }
            # data_dict = {
            #     "text": doc.find('text').get_text().strip()
            # }
            # data.append(op_dict)
            # data.append(data_dict)

            try:
                id = doc.docno.extract().get_text().strip()
                try:
                    doc.docid.extract()
                except AttributeError:
                    pass
                try:
                    doc.length.extract()
                except AttributeError:
                    pass
                yield {
                    "_index": index,
                    "_op_type": "index",
                    "_type": TYPE,
                    "_id": id,
                    "text": re.sub('[\s\n]+', ' ', doc.get_text().strip())
                }
            except AttributeError:
                continue
Example #35
    def crawl_page(self):
        self.browser = webdriver.Safari()
        self.browser.implicitly_wait(5)
        self.browser.set_page_load_timeout(40)

        self.init_html_url_queue()
        part_cnt = 1
        while self.url_queue.qsize() > 0:
            self.total_cnt += 1
            if part_cnt % 500 == 0:
                print('[INFO] refreshing browser to collapse tree...')
                self.browser.refresh()
                time.sleep(10)

            curr_url_ele = self.url_queue.get()
            curr_url_idx = curr_url_ele['idx']
            curr_url_bth = curr_url_ele['breath']
            curr_url_lef = curr_url_ele['leaf']
            if self.col.find({'idx': curr_url_idx}).count() != 0:
                print('[INFO] %sth\t%s already exists in mongodb' %
                      (self.total_cnt, curr_url_idx))
                continue
            try:
                part_cnt += 1
                curr_url = self.url_page_format + curr_url_idx
                # print('[INFO] --- start crawling with url %s ------->' % curr_url)
                self.browser.get(curr_url)
                time.sleep(3)
                page = self.browser.page_source

                # curr_idx = self.crawled_idx_pattern.findall(page)[0]
                # if curr_idx in self.crawled_idx:
                # 	print('[ERROR] %s occurs again, probably due to page frozen, sleep for a while' % curr_idx)
                # 	self.url_queue.put(curr_url_ele)
                # 	self.browser.refresh()
                # 	time.sleep(10)
                # 	continue
                # self.crawled_idx.add(curr_idx)

                page_sup = bsoup(page, features='html.parser')
                page_div = page_sup.find_all('div', {'id': 'firstright'})
                if len(page_div) > 0:
                    self.save_mongo_rawpage(curr_url_idx, str(page_div[0]),
                                            curr_url_bth, curr_url_lef)
                else:
                    self.save_mongo_rawpage(curr_url_idx, page, curr_url_bth,
                                            curr_url_lef)
            except TimeoutException as time_exp:
                print('[ERROR] time out error with code : %s' % time_exp)
                self.url_queue.put(curr_url_ele)
            except Exception as e:
                self.url_queue.put(curr_url_ele)
                print('[ERROR] unexpected error occurs with code : %s' % e)
                traceback.print_exc()
                # break
        self.browser.close()
Example #36
 async def topics(self):
     async with aiohttp.ClientSession() as ses:
         async with ses.get(self.base_url) as res:
             s = bsoup(await res.text(), 'lxml')
             return [{
                 "name":
                 a.find("span", class_='_1WMnM xLon9').text,
                 "url":
                 f"{self.base_url}{a.find('a', class_='qvEaq')['href']}"
             } for a in s.find_all('li', class_='_1hkdt')]
Example #37
def clean(html_str):
    souped = bsoup(html_str.strip(), 'html.parser')

    temp = []
    for element in souped.find_all(True, recursive=False):
        temp.append(flatten(element))

    result = ' '.join(temp)
    result = whitespaces_matcher.sub(' ', result)
    return result
Example #38
def scrape(path: str):
    if bool(re.match(r'((http|https):\/\/)?(www.)?maid.my.id/(.*)', path)):
        url = path
    elif not path.startswith('/'):
        url = f'{BASE}/{path}'
    else:
        url = BASE + path
    response = requests.get(url)
    soup = bsoup(response.text, 'lxml')
    return soup
Example #39
def get_topics ():
	tList = []
	r = requests.get('{}/{}'.format(BASE_URL, 'topic'))
	soup = bsoup(r.text, 'lxml')
	nav_footer = soup.find(id='block-views-topics-block_2')
	table = nav_footer.find('table', {'class' : 'views-view-grid grid-2'})
	topics = table.findAll('a')
	for i in topics:
		tList.append(i.text)
	return tList
Example #40
def xml_parse(domain):
    try:
        sitemap_xml = bsoup(get_sitemap(domain), 'lxml')
        urls = sitemap_xml.find_all('loc')
        for url in urls:
            yield url.string
    except Exception:
        pass
    except KeyboardInterrupt:
        exit('Bye!')
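A short usage sketch; get_sitemap(domain) is assumed (per the code above) to return the raw sitemap XML for the domain. Note that because the generator body is wrapped in a broad except Exception, any parse failure simply ends the iteration silently.
for page_url in xml_parse('example.com'):
    print(page_url)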
Example #41
def handle_readmore(text_body, text_body_attrs):
	body_html = bsoup(text_body)
	readmores = body_html.findAll('a', class_='read_more')	
	try:
		for flag in readmores:
			user_host_name, pid = re.search(r'\://([^/]+)/post/(\d+)', flag['href']).groups()
			print user_host_name, pid
			try:
				post = get_post(user_host_name, pid)
			except:
				continue
			replacement = bsoup( post[text_body_attrs[post['type']]] )
			replacement.find('html').unwrap()
			replacement.find('body').unwrap()
			flag.replace_with(replacement)
		return str(body_html)
	except:
		errors.exception('Readmore fail:\n%s', text_body)
		return text_body
Example #42
 def _read_text(self, path):
     try:
         page = bsoup(open(path, 'rb').read(), 'html.parser')
         text = [
             div.text for div in page.find('div', class_='main').find_all(
                 'div')[1:-2]
         ]
         return re.sub(' [0-9]+\\xa0', '', ''.join(text))
     except:
         pass
Example #43
    def parse(self):
        r = requests.get(self._proxy_url)

        soup = bsoup(r.content, u'html.parser')
        proxies_tag = soup.find(
            u'table',
            class_=u'htable proxylist').find(u'tbody').find_all(u'tr')

        for proxy_tag in proxies_tag:
            self.proxies_list.append(proxy_tag.find_all(u'td')[0].text.strip())
Example #44
    def soupify(self, response):
        """
		Uses BeautifulSoup4 to return the HTML content
		of the specified HTTP response, as obtained
		using the requests module. "response" is not
		a web address, but an already pinged response
		object generated from an address.
		"""
        print('souping')
        return bsoup(response.content, 'html.parser')
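A usage sketch for soupify; "scraper" stands in for a hypothetical instance of whatever class this method belongs to:
import requests

resp = requests.get('https://example.com')
soup = scraper.soupify(resp)
print(soup.title)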
Example #45
    def CheckSequel(self, url):
        previous_score = ''
        previous_pop = ''
        isSequel = 'False'

        search_url = requests.get(
            url,
            headers={
                "User-Agent":
                "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36"
            }).content

        new_html = bsoup(search_url, "lxml")

        old_start = GetText(self.html, 'Aired:')
        old_start = old_start.split(' to ')[0]

        new_start = GetText(new_html, 'Aired:')
        new_start = new_start.split(' to ')[0]

        try:
            if date.strptime(old_start, '%b %d, %Y') > date.strptime(
                    new_start, '%b %d, %Y'):
                isSequel = 'True'
                #Get previous show popularity and rating
                initial = new_html.find('span',
                                        attrs={'itemprop': 'ratingValue'})
                previous_score = initial.get_text()

                previous_pop = GetText(new_html, 'Members:')

        except ValueError:
            try:
                if date.strptime(old_start, '%b %d, %Y') > date.strptime(
                        new_start, '%Y'):
                    isSequel = 'True'
                    #Get previous show popularity and rating
                    initial = new_html.find('span',
                                            attrs={'itemprop': 'ratingValue'})
                    previous_score = initial.get_text()

                    previous_pop = GetText(new_html, 'Members:')

            except ValueError:
                if date.strptime(old_start, '%Y') > date.strptime(
                        new_start, '%b %d, %Y'):
                    isSequel = 'True'
                    #Get previous show popularity and rating
                    initial = new_html.find('span',
                                            attrs={'itemprop': 'ratingValue'})
                    previous_score = initial.get_text()

                    previous_pop = GetText(new_html, 'Members:')

        return [isSequel, previous_score, previous_pop]
Example #46
def _getPostmodernSentences(url = 'http://dev.null.org/postmodern/'):
    soup = bsoup(urllib2.urlopen(url).read(), 'html5lib')
    sentences = []

    for tag in soup.body.children: # read all <p> tags up to the first <hr>
        if tag.name == 'p':
            sentences.extend([_parsePostmodern(s) for s in splitSentences(tag.get_text()) if len(s) > 1])
        if tag.name == 'hr':
            break

    return sentences
Example #47
def get_todays_games(date):
	games = []
	doc = bsoup(requests.get(GAMELINE_URL.format(
		year=date.year,
		month=str(date.month).zfill(2),
		day=str(date.day).zfill(2),
	)).text)

	divs = doc.find(id="nbaSSOuter").find_all("div", class_="nbaModTopScore")

	return [get_game(g) for g in divs]
Example #48
def isFlood(res):
	soup = bsoup(res.text)
	sel = soup.select(".messageArea h3")
	if sel:
		sel = sel[0].decode_contents()
		if sel == u'内容確認' and u'存在しないユーザIDです' not in res.text:
			return True
		else:
			return False
	else:
		return False
Example #49
    def add_row(self, local_pn, item):
        row = self.__outsoup.new_tag("tr")

        # Local PN
        col = self.__outsoup.new_tag("td")
        col.string = local_pn
        row.append(col)

        # Digikey PN
        col = self.__outsoup.new_tag("td")
        search_link = self.__outsoup.new_tag("a", href=item['search_url'], target="_blank")
        search_link.string = item['digikey_pn']
        col.append(search_link)
        row.append(col)

        # Pricing information
        col = self.__outsoup.new_tag("td")
        table = ("<table class=\"table table-hover\"border=\"1\"><tbody><tr><th>Price Break</th>"
                "<th>Unit Price</th></tr></tbody></table>")
        table = bsoup(table, 'html.parser')

        table.tbody.append(self.__row_for(item['pricing']['min']))
        table.tbody.append(self.__row_for(item['pricing']['max']))
        col.append(table)
        row.append(col)

        # Image
        col = self.__outsoup.new_tag("td")
        if(item.has_image_url()):
            img_title = self.__outsoup.new_tag("div")
            img_title.string = "NAME: " + self.__get_basename_from_url(item['image_url'])
            col.append(img_title)
            img_div = self.__outsoup.new_tag("div")
            img_link = self.__outsoup.new_tag("a", href=item['image_url'], target="_blank")
            img_tag = self.__outsoup.new_tag("img", border="0", width="100", src=item['image_url'])
            img_link.append(img_tag)
            img_div.append(img_link)
            col.append(img_div)
        else:
            col.string = "N/A"
        row.append(col)

        # Datasheets
        col = self.__outsoup.new_tag("td")
        if(item.has_datasheet_urls()):
            for url in item['datasheet_urls']:
                link = self.__outsoup.new_tag("a", href=url, target="_blank")
                link.string = self.__get_basename_from_url(url)
                col.append(link)
        else:
            col.string = "N/A"
        row.append(col)

        self.__outsoup.find(class_="table").append(row)
Example #50
    def pull_title_and_images(self):
        filter_query = Q(url_title=None) | Q(url_title='') | Q(url_image=None) | Q(url_image='')
        url_without_title = News.objects.filter(filter_query).values_list('url', flat=True)

        for url in url_without_title:
            try:
                headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36'}
                response = requests.get(url, headers=headers)
            except requests.exceptions.ConnectionError as e:
                response = None

            if response:
                soup = bsoup(response.text, "html.parser")

                url_title = None
                if soup.title:
                    url_title = soup.title.string

                url_description = None
                twitter_description = soup.find('meta', attrs={'property': 'twitter:description', 'content': True})
                og_description = soup.find('meta', attrs={'property': 'og:description', 'content': True})
                meta_description = soup.find('meta', attrs={'name': 'description', 'content': True})
                if twitter_description:
                    url_description = twitter_description['content']
                elif og_description:
                    url_description = og_description['content']
                elif meta_description:
                    url_description = meta_description['content']

                url_image = None
                twitter_image = soup.find('meta', attrs={'property': 'twitter:image:src', 'content': True})
                og_image = soup.find('meta', attrs={'property': 'og:image', 'content': True})
                if twitter_image:
                    url_image = twitter_image['content']
                elif og_image:
                    url_image = og_image['content']
                else:
                    images = soup.find_all('img')
                    url_image = get_biggest_images(images)

                news_is_exist = News.objects.filter(url=url).exists()
                if news_is_exist:
                    news = News.objects.get(url=url)
                    if url_title and url_description:
                        if news.channel:
                            url_title = url_title.replace(' - %s' % news.channel.name, '')
                            url_title = url_title.replace(' | %s' % news.channel.name, '')
                        news.url_title = url_title
                        news.url_description = url_description
                        news.save()
                        News.objects.filter(url=url, url_image__isnull=True).update(url_image=url_image)
                    else:
                        news.delete()
Example #51
def isRestricted(res):
	soup = bsoup(res.text)
	sel = soup.select(".messageArea h3")
	if sel:
		sel = sel[0].decode_contents()
		if sel == u'アクセスできません':
			return True
		else:
			print "Restricted?: %s" % (sel,)
			return False
	else:
		return False
Example #52
def search_emojipedia(query):
    html = cache.get(query)
    if html is None:
        url = u'http://emojipedia.org/search/?q={}'.format(query)
        r = requests.get(url)
        html = r.text
        cache.set(query, html)
    soup = bsoup(html, 'lxml')
    results = soup.find('ol', class_='search-results')
    results = results.find_all('a')
    results = [result.text.replace(' ', '  ', 1) for result in results if result is not None]
    return results
Example #53
def parse_espn(name):
    url = "http://espn.go.com/nba/team/schedule/_/name/" + name
    r = rq.get(url)
    soup = bsoup(r.content, 'lxml')


    ## Find all the rows that have classes. Remove the first one -- it's irrelevant.
    trs = soup.find_all("tr", class_=True)[1:]
    ## Main procedure.

    with open("nba_2016_csv/" + name + "_2016_schedule.csv", "wb") as ofile:
        f = csv.writer(ofile)
        ## Write the headers. 
        f.writerow( ["team","date","a/h","opponent","w/l","s","os"] )
        for tr in trs:
            team = name
            tds = tr.find_all("td")
            date = tds[0].get_text().encode("utf-8")
            opp = tds[1].find_all( "li", {"class": "team-name"} )
            for teams in opp:
                other_team = teams.get_text()

            opponent = tds[1].find_all( 'li',{'class':'game-status'} )
            for o in opponent:
                h_a = o.get_text()
                try:
                    win_loss = tds[2].find_all( 'li',{'class':'game-status win'} ) or tds[2].find_all( 'li',{'class':'game-status loss'} )
                    for a in win_loss:
                        w_l = a.get_text()
                    score = tds[2].find_all('a')
                    for s in score:
                        if w_l == 'W':
                            gs = s.get_text().split("-")[0]
                            gs = int(gs)
                            ogs = s.get_text().split("-")[1].split(" ")[0]
                            ogs = int(ogs)                            
                        elif w_l == 'L':
                            gs = s.get_text().split("-")[1].split(" ")[0]
                            gs = int(gs)
                            ogs = s.get_text().split("-")[0]
                            ogs = int(ogs)
                except:
                        h_a = ''
                        w_l = None
                        other_team = ''
                        gs = None
                        ogs = None
            ## write the result to the CSV file.
                finally:
                    f.writerow([team,date,h_a,other_team,w_l,gs,ogs])
                    g = dict(date=date, team=team,h_a=h_a,other_team=other_team, w_l=w_l,s=gs,os=ogs)
                    connect().save(g)
Example #54
def get_indicator_code (ind):
	if not ind:
		return None
	iCodes = dict()
	r = requests.get('{}/{}'.format(BASE_URL, 'indicator'))
	soup = bsoup(r.text, 'lxml')
	tables = soup.findAll('tbody')
	for table in tables:
		links = table.findAll('a')
		for link in links:
			if link.text in ind:
				iCodes[link.text] = link.get('href').split('/')[-1]
	return iCodes
Example #55
 def _getFromWeb(self, s_date, s_filter='DI1'):
     '''
     Return a dataframe of the data of specific data
     '''
     # fetch data from the website
     url = self.url + s_date
     resp = requests.get(url)
     ofile = resp.text
     soup = bsoup(ofile, 'lxml')
     soup.prettify()
     tables = soup.find_all('table')
     storeValueRows = tables[6].find_all('tr')
     # check whether any values were returned
     if len(storeValueRows) == 2:
         return None
     # pull out the information of interest
     storeMatrix = []
     s_ass = ''
     for row in storeValueRows:
         storeMatrixRow = []
         for cell in row.find_all('td'):
             s = cell.get_text().strip()
             if s != '':
                 storeMatrixRow.append(s)
         if len(storeMatrixRow) == 6:
             s_ass = storeMatrixRow[0].split()[0]
             storeMatrixRow = [s_ass] + storeMatrixRow[1:]
         elif len(storeMatrixRow) == 5:
             storeMatrixRow = [s_ass] + storeMatrixRow
         storeMatrix.append(storeMatrixRow)
     # build a dataframe with the filtered data
     df = pd.DataFrame(storeMatrix[1:], columns=storeMatrix[0])
     if s_filter:
         df = df[df.Mercadoria == s_filter].reset_index(drop=True)
     df = df.ix[:, :-2]
     df.index = [list(df.Mercadoria + df.Vct), [s_date]*df.shape[0]]
     df.drop([u'Mercadoria', u'Vct'], axis=1, inplace=True)
     # convert the data into a dictionary
     d_rtn = {}
     d_aux = df.T.to_dict()
     for x in d_aux:
         if x[0] not in d_rtn:
             d_rtn[x[0]] = {}
         s_atual = d_aux[x][u'Pre\xe7o de Ajuste Atual']
         s_anterior = d_aux[x][u'Pre\xe7o de Ajuste Anterior']
         s_atual = s_atual.replace('.', '').replace(',', '.')
         s_anterior = s_anterior.replace('.', '').replace(',', '.')
         d_rtn[x[0]][x[1]] = {'PU_Anterior': float(s_anterior),
                              'PU_Atual': float(s_atual)}
     return d_rtn
Example #56
def parseHTML(html):#parse a google page into result objects
	# netlib calls this before returning its results
	results = [] #list of result objects
	if html==None:
		#then work with `self.gooHTML'
		return []
	else:
		#use `html'
		#okay off to the lab, gotta whip up a small script to get this going
		#need to check that the ires div is available on the page
		resDiv = bsoup(html).find('div',{'id':'ires'})
		resultTags = ""
		try:
			if resDiv.ol:
				#we know that the ol tag is available
				resultTags = resDiv.ol.findAll('li',{'class':'g'})
			if len(resultTags) > 0:
				for result in resultTags:
					resultObj = goo_result(None,None,None,None,None)
					#a couple of things could be in each result tag
					#just realized GooDork would make an awesome browser extension!
					if result.blockquote: #sometimes results are grouped together using this tag
						result=result.blockquote
					if result.h3: #the link to the actual page sits in this tag, with the title of the page
						h3=result.h3
						if h3.a:
							href=h3.a.get('href')
							href=str(href[7:].split("&sa=")[0])
							title=''.join([string for string in h3.a.strings])
							resultObj.setURL(href)
							resultObj.setTitle(title)
					if result.div: #some extra's, possibly containing the cache link, image or summary	
							summary=''.join([string for string in result.div.strings])
							resultObj.setSummary(summary)
							if result.div.div: #contains the cache link
								cached=result.div.div
								if cached.cite:
									citeURL=''.join([ string for string in cached.cite.strings ])
									resultObj.setCiteURL(citeURL)
								if cached.span:
									if cached.span.a:
										#here lies the cached link
										cacheLink=cached.span.a.get('href')
										resultObj.setCacheLink(cacheLink[2:])
					results.append(resultObj)
				return results
			else:
				return [] #no results could be found
		except Exception, e:
				raise Exception("\n\t\t[goo_result] Problem parsing Google Search page:\n\t\t%s" % (e))
Example #57
def do_crawl(job_id, depth, url):
    """
    Perform a new crawl task for the given `job_id` and `url`.

    If `depth > 0` then enqueue additional crawl tasks for each
    valid `<a href=""></a>` tag encountered at this URL.

    NOTE: This works purely on static HTML. No Javascript gets run!
    """

    log("Starting crawl (job_id='%s' depth='%s' url='%s')" % (
            job_id, depth, url))

    # Increment inprogress
    r.incr("JOB_%s_INPROGRESS" % job_id)

    try:
        # Get image urls
        page = requests.get(url).content

        html = bsoup(page)

        # Push all img srcs to database
        for img_tag in html.find_all('img'):
            if not img_tag.get("src"):
                # Skip images with empty src attrs
                continue 
            r.sadd("JOB_%s_RESULTS" % job_id, img_tag["src"])

        # If we should go deeper, enqueue more crawls 
        if depth > 0:
            for a_tag in html.find_all("a"):
                href = a_tag.get("href","")
                if not href or href.startswith("javascript"):
                    continue
                # Build full url
                full_url = urljoin(url, href)
                # Enqueue a crawl for this job for this url, decrementing depth counter
                r.rpush('CRAWL_QUEUE', "%s$%s$%s" % (job_id, depth - 1, full_url))
    
    except requests.exceptions.SSLError:
        warn("SSL Error: Skipping url '%s'" % url)

    finally:
        # Always decrement inprogress
        r.decr("JOB_%s_INPROGRESS" % job_id)

    # Increment completed
    r.incr("JOB_%s_COMPLETED" % job_id)
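do_crawl re-enqueues deeper crawls as "job_id$depth$url" strings on CRAWL_QUEUE, so a worker that feeds it might look like this sketch (assuming r is the same redis-py client used above):
while True:
    # blpop blocks until an entry is available and returns a (key, value) pair
    _, raw = r.blpop('CRAWL_QUEUE')
    job_id, depth, url = raw.decode().split('$', 2)
    do_crawl(job_id, int(depth), url)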
Example #58
 def loadImage(self, parFnImg):
     if os.path.isfile(parFnImg):
         self.dataImg=ops.OpenSlide(parFnImg)
         xmlInfo=bsoup(self.dataImg.properties['tiff.ImageDescription'],'lxml-xml')
         lstXMLObj=xmlInfo.find_all("DataObject",  ObjectType="PixelDataRepresentation")
         arrSizesMm=np.zeros(len(lstXMLObj), np.float)
         for i,ii in enumerate(lstXMLObj):
             tmp=ii.find_all("Attribute", Name="DICOM_PIXEL_SPACING")[0]
             tsiz=float(tmp.getText().split(" ")[0].replace('"',''))
             tidx=int(ii.find_all("Attribute", Name="PIIM_PIXEL_DATA_REPRESENTATION_NUMBER")[0].getText())
             print i, " : ", tidx, " : ", tsiz
             arrSizesMm[tidx]=tsiz
         self.realScales=np.round(arrSizesMm/arrSizesMm[0])
         arrLayerSizes=np.array(self.dataImg.level_dimensions)
         self.layerSizes=np.array([(arrLayerSizes[0][0]/ss, arrLayerSizes[0][1]/ss) for ss in self.realScales], np.int)
         self.numLevels=self.dataImg.level_count
Example #59
	def manage_remainder_links(self):
		for ak in self.acctobjs:
			for i in range(1,ak.num_acct_pages()):
				burl = ak.base + "?page={}".format(i)
				rb = requests.get(burl).text
				sp = bsoup(rb)
				lks = sp.findAll('a')
				for l in lks:
					tst = re.search(self.vid_patt, l['href'])
					if tst is not None:
						ed = tst.groups()[0]
						if ed not in self.ids:
							self.ids[ed] = 1
						else: continue
					else:
						continue
Example #60
def results(html):
	res_wrap = None
	res_wrap = bsoup(html).find('div',{'id':'ires'})
	if not (res_wrap):
		raise Exception('Could not parse file')
	if len(res_wrap)==1:
		if res_wrap.ol:
			oltag = res_wrap.ol
			gresults=oltag.findAll('li',{'class':'g'})
			#print type(gresults) #ResultSet
			if len(gresults)>=1:
				return gresults
		else:
			raise Exception('No results found!')
	else:
		raise Exception('No results found!')
	return