def getDomFromFile(url):
    #html = requests.get("http://www.autolanka.com/Buy.asp").content
    html = open('index' + str(url), 'r')
    dom = lxml.html.fromstring(html.read())
    #data = minePage(dom)
    return dom
def get_email(self, url, search_word):
    self._clear_variable()
    if url == 'nan':
        self.emails = ''
        return
    if fnmatch.fnmatch(url, '*.txt') or fnmatch.fnmatch(url, '*.pdf'):
        self.emails = ''
        return
    try:
        html = urlopen(url)
        soup = BeautifulSoup(html.read(), "lxml")
        email = soup.find_all(string=re.compile(search_word))
        self._set_emails(email)
        if len(self.emails) > 0:
            print('Found the string {}.\nemails: {}'.format(
                search_word, self.emails))
        else:
            print('No string containing {} was found.'.format(search_word))
    except urllib.error.HTTPError as e:
        print(e)
        if e.code == 403:
            self.emails = None
        else:
            self.emails = ''
def download(url, num_retries=2, headers={'User-agent': 'wswp'}):
    print 'Downloading:' + url
    # a hard-coded 'cookie' header can be added here for sites that require a logged-in session
    request = urllib2.Request(url, headers=headers)
    try:
        response = urllib2.urlopen(request)
        html = response.read()
        if response.info().get('Content-Encoding') == 'gzip':
            html = gzip.GzipFile(fileobj=StringIO.StringIO(html), mode="r")
            try:
                html = html.read()  # .decode('gbk').encode('utf-8')
            except IOError as e1:
                # Amazon responses occasionally fail to decompress; re-fetch instead
                html = urllib2.urlopen(request).read()
    except urllib2.URLError as e:
        print 'Downloading error:', e.reason
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                return download(url, num_retries - 1, headers)
    #print html
    return html
def match_walletID_bitaddr(ID_txhash, address_type):
    global idx
    socket.setdefaulttimeout(3)
    for walletId in ID_txhash.keys():
        idx += 1
        print(idx)
        try:
            txhashes = ID_txhash[walletId]
        except Exception as e:
            continue
        for txhash in txhashes:
            try:
                request = urllib.request.urlopen('http://www.qukuai.com/search/zh-CN/BTC/' + txhash)
                html = request.read()
                request.close()
                address = get_address(html, address_type)
                # print('method2 ', walletId, address)
            except Exception as e:
                try:
                    html = urllib.request.urlopen('https://blockchain.info/rawtx/' + txhash)
                    hjson = json.loads(html.read())
                    address = parse_transaction(hjson, address_type)
                    # print('method1 ', address)
                except Exception as e:
                    print('get address failed')
                    continue
            if walletId not in walletId_bitaddr:
                # print('1 ', walletId, address)
                walletId_bitaddr[walletId] = address
            else:
                # print('not 1 ', walletId, address)
                walletId_bitaddr[walletId].extend(address)
def processRounds(roundURLs):
    for roundURL in roundURLs:
        html = urllib2.urlopen(siteURL + roundURL)
        roundPage = lxml.html.fromstring(html.read())
        html.close()
        round = roundPage.cssselect(
            "li[id='tpRound'] a")[0].text_content().replace(
                "round ", "").replace(" Rankings", "").strip()
        print "Round: " + round
        roundRows = roundPage.cssselect("div[id='view_standard'] tr")
        # the page links are specified in the footer
        pageLinks = roundRows[-1].cssselect("a")
        # remove the "next page" link
        del pageLinks[-1]
        for link in pageLinks:
            linkURL = siteURL + link.get("href")
            print linkURL
            scrapePage(linkURL, round)
        calculateExtraStats(round)
def gettitle(url):
    requests.packages.urllib3.disable_warnings()
    req = request.Request(url)
    try:
        resp = request.urlopen(req)
        html = urlopen(url)
        # parse the response body;
        # catch the exception raised when the target tag is missing from the page
        try:
            soup = BeautifulSoup(html.read(), 'lxml')
            title = soup.title.text
            tfw = open("title.txt", "a")
            tfw.write(str(soup.title.text) + "\n")
            tfw.close()
            ufw = open("url.txt", "a")
            ufw.write(str(resp.url) + "\n")
            ufw.close()  # close() is required, otherwise nothing gets written
        except AttributeError as e:
            print(url + " " + "no title")
            efw = open("eception.txt", "a")
            efw.write(url + " no title" + "\n")
    except error.HTTPError as e:
        print(e.code)
        efw = open("eception.txt", "a")
        efw.write(url + " " + str(e.code) + "\n")
    except error.URLError as e:
        print(e.reason)
        efw = open("eception.txt", "a")
        efw.write(url + " " + str(e.reason) + "\n")
def get_html(self):
    try:
        html = urllib2.urlopen(URL)
    except Exception as e:
        self.exit(STATES.UNKNOWN, 'Error while opening url: %s' % str(e))
    if html.getcode() >= 400:
        self.exit(STATES.UNKNOWN, 'HTTP error: %d' % html.getcode())
    return html.read()
def test1():
    j = json.loads('{"one" : "1", "two" : "2", "three" : "3"}')
    html = urlopen(
        "http://www.czce.com.cn/portal/DFSStaticFiles/Future/2017/20171026/FutureDataDaily.xls"
    )
    data = html.read()
    print(data)
    return
def getHtml(self, url):
    if self.testUrl(url) is True:
        html = urllib.request.urlopen(url)
        mybytes = html.read()
        mystr = mybytes.decode("utf8")
        html.close()
        return mystr
    else:
        return None
def yuandaima(ss):
    url = ss
    headers1 = {'GET': url,
                'Host': "www.icpcw.com",
                'User-Agent': "Mozilla/5.0 (Windows NT 6.2; rv:28.0) Gecko/20100101 Firefox/28.0",
                'Referer': url}
    req = urllib.request.Request(url, headers=headers1)
    html = urllib.request.urlopen(req)
    scode = html.read().decode('utf-8', 'ignore')
    return scode
def getSeniority(linkList):
    myList = []
    for link in linkList:
        html = urlopen(link)
        bs = BeautifulSoup(html.read(), 'html.parser')
        seniority = bs.find(
            'div', {'col star-section text-center active'}).findNext('p')
        myList.append(seniority.get_text())
    return myList
def __init__(self, url): print("load codeforces contest %s" % url) base = urlparse(url).netloc html = request.urlopen(url) self.dom = lxml.html.fromstring(html.read()) self.contest_id = CFContest.get_contest_id(url) self.pdf_name = "CF" + self.contest_id + ".pdf" self.problems = [] for problem_a_tag in self.dom.xpath('//table[@class="problems"]/tr[position() > 1]/td[1]/a'): self.problems.append(CFProblem("https://" + base + problem_a_tag.attrib['href']))
def get_all_functions(self, passedurl, topics):
    '''open the function page for parsing'''
    html = urllib.urlopen(passedurl)
    html = html.read()
    maintree = etree.parse(StringIO(html), self.parser)
    mainContent = maintree.xpath("//div[@class='section']")  # scrape main div containing data
    if self.url == 'http://docs.scipy.org/doc/scipy/reference/':
        self.scrape_section(mainContent[0], topics, scipy_first=True)
    else:
        self.scrape_section(mainContent[0], topics)
def get_html(self, url):
    opener = urllib2.build_opener()
    # agence.santemontreal.qc.ca seems to prohibit access (403) to "custom" http agents
    # (like the urllib2 one); by forcing User-agent we work around the problem:
    opener.addheaders = [('User-agent', 'Mozilla/5.0')]
    try:
        html = opener.open(url)
    except Exception as e:
        self.exit(STATES.UNKNOWN, 'Error while opening url: %s' % str(e))
    if html.getcode() >= 400:
        self.exit(STATES.UNKNOWN, 'HTTP error: %d' % html.getcode())
    return html.read()
def request(self, url, params={}, timeout=180):
    error = None
    for x in range(0, settings.http_tries):
        try:
            if params:
                params = urllib.urlencode(params)
                html = urllib2.urlopen(url, params, timeout)
            else:
                html = urllib2.urlopen(url)
            return html.read()
        except Exception as e:
            error = e
    raise error
def getTitleAll(url, t1, t2, t3):
    try:
        html = urlopen(url)
    except HTTPError as e:
        return None
    try:
        bsObj = BeautifulSoup(html.read())
        title = bsObj.body.h1
        price = bsObj.findAll(t1, attrs={t2: t3})
        print(title.get_text())
        for el in price:
            print(el.get_text())
    except AttributeError as e:
        return None
    return price
def getTitle(url):
    try:
        html = urlopen(url)
    except HTTPError as e:
        return None
    try:
        bsObj = BeautifulSoup(html.read())
        title = bsObj.body.h1
        price = bsObj.findAll("span", attrs={"class": "cost"})
        print(title.get_text())
        for el in price:
            print(el.get_text())
    except AttributeError as e:
        return None
    return price
def search(s):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'
    }
    req = request.Request('https://baike.baidu.com/item/' + quote(s, 'utf-8'), headers=headers)
    html = urlopen(req)
    bsObj = BeautifulSoup(html.read(), "html.parser")
    bs = bsObj.find_all(name='div', attrs={'class': 'para'})
    content = ""
    for i in bs:
        content = f'{content}{i.text}'
    return content
def expert_prepare(_url):
    dictionary = {}
    db = MySQLdb.connect(host='localhost', user='******', passwd='123qwe',
                         db='infoport', charset='utf8',
                         init_command='SET NAMES UTF8')
    cursor = db.cursor()
    cursor.execute('select interest, article_id from exbd')
    result = cursor.fetchall()
    i = 0
    k = 0
    listkeys = []
    dictkeys = {}
    for record in result:
        if record[i + 1] == _url:  # was !=
            dictkeys[k] = record[i]
            k = k + 1
            #listkeys.append(record[i])
    dictionary['keyword'] = dictkeys
    #dictionary['keyword'] = dictkeys.get('keys')
    #dictionary['keyword'] = listkeys
    #print dictionary['keyword']
    html = urllib.urlopen(_url)
    doc = lxml.html.document_fromstring(html.read().decode('utf-8', 'ignore'))
    post = doc.cssselect('div.main .person-appointment-title')[0]
    dictionary['pos'] = post.text  # .encode('utf-8')
    academictitle = doc.cssselect('div.main .person-appointment-title')[0]
    dictionary['academic_title'] = academictitle.text  # .encode('utf-8')
    fio = doc.cssselect('div.footer__breadcrumbs .b ')[0]  # full name
    dictionary['fio'] = fio.text  # .encode('utf-8')
    items = doc.cssselect('div.g-pic')
    for item in items:
        image = item.get('style')
        s = image.split("'")
        page = 'http://www.hse.ru' + s[1]
        person_id = page.split("/")
        dictionary['person_id'] = person_id[6]
        #print page  # address of the page that holds the photo
    place = doc.cssselect('div.main .person-appointment-title + .link')
    #dictionary['place'] = place[0].text
    #print place[1].text  # prints the CITY
    dictionary['photo'] = page
    #json_data = json.dumps(dictionary)
    #print json_data
    return dictionary
def use(self, rawcommand):
    if not len(rawcommand):
        return
    cooked = urllib.parse.urlencode({"search": rawcommand})
    html = urllib.request.urlopen(
        "http://t-rechnik.info/search.php?" + cooked)
    html = html.read().decode("utf8")
    root = lxml.html.fromstring(html)
    tbl = root.get_element_by_id("table")
    if len(tbl) == 3:
        txt = tbl[2].text_content()
        txt = re.sub(r"\r|\n", " ", txt)
        txt = re.sub(r"\s+", " ", txt)
        self.bot.privmsg(self.bot.sender[0], txt, option="multiline")
        return
    return "nothing found"
def Scrape(tech, city, starting_page: int, ending_page: int):
    generalList = []
    #generalList.append('Job Title;;;Employer Name;;;Salary;;;Link;;;Seniority;;;describtion;;;experience')
    if not city:
        city = 'warszawa'
    for i in range(starting_page, ending_page + 1):
        #print("Trying crawling on page " + str(i) + "/" + str(ending_page))
        if tech:
            url = 'https://nofluffjobs.com/pl/jobs/' + city + '/' + tech + '?criteria=city%3D' + \
                  city + '%20' + tech + '&page=' + str(i)
        else:
            url = 'https://nofluffjobs.com/pl/jobs/' + city + '?criteria=city%3D' + \
                  city + '&page=' + str(i)
        # TODO: handle the case where neither tech nor location is given
        try:
            html = urlopen(url)
            #print("HTML found (1/3)")
        except HTTPError as e:
            #print('HTML does not exist')
            break
        except URLError as e:
            #print("Server not found")
            break
        else:
            pass
            #print("Successfully connected to the server! (2/3)")
        bs = BeautifulSoup(html.read(), 'html.parser')
        title = getTitle(bs)
        employer = getEmployer(bs)
        salary = getSalary(bs)
        link = getLinks(bs)
        seniority = getSeniority(link)
        desc = getDescription(link)
        experience = getExperience(desc)
        for i in range(countOffers(bs)):
            # job offer objects are created here
            jobOffer = Oferta(title[i], employer[i], salary[i], link[i],
                              seniority[i], desc[i], experience[i])
            # jobOffer = "%s;;;%s;;;%s;;;%s;;;%s;;;%s;;;%s" % (title[i], employer[i], salary[i],
            #                                                  link[i], seniority[i], desc[i], experience[i])
            generalList.append(jobOffer)
    return generalList
def getTitle(url):
    try:
        html = urlopen(url)
    except HTTPError as e:
        return None
    try:
        bsObj = BeautifulSoup(html.read(), "lxml")
        title = bsObj.body.h1
    except AttributeError as e:
        return None
    return title


title = getTitle(url)
if title is None:
    print("Title could not be found")
else:
    print(title)
def main(self):
    '''Scrapes function name, argument list, description for argument,
    URL for description, URL for examples.'''
    html = urllib.urlopen(self.url)
    html = html.read()
    maintree = etree.parse(StringIO(html), self.parser)
    mainContent = maintree.xpath("//div[@class='section']")  # scrape main div containing data
    main_h1 = [child for child in mainContent[0].iterchildren('h1')]  # get its child h1
    contentHTML = etree.tostring(main_h1[0], pretty_print=True)
    tree = etree.parse(StringIO(contentHTML), self.parser)
    title_text = tree.xpath("//text()")[0].strip()  # title_text
    all_content = [child for child in mainContent[0].iterchildren('div')]  # get its child div
    contentHTML = etree.tostring(all_content[0], pretty_print=True)
    tree = etree.parse(StringIO(contentHTML), self.parser)
    all_content_class = tree.xpath("//@class")[0].strip()
    if all_content_class == 'toctree-wrapper compound':
        main_ul = [child for child in all_content[0].iterchildren('ul')]  # get its child ul
    else:
        main_ul = [child for child in all_content[1].iterchildren('ul')]  # get its child ul
    main_li = [child for child in main_ul[0].iterchildren('li')]  # get its child li
    for each_li in main_li:
        main_a = [child for child in each_li.iterchildren('a')]  # get its child a
        sectionHTML = etree.tostring(main_a[0], pretty_print=True)
        tree = etree.parse(StringIO(sectionHTML), self.parser)
        main_topic = ' '.join(tree.xpath("//text()")).encode('utf-8').strip()
        main_topic_link = tree.xpath("//@href")[0].encode('utf-8').strip()
        # main_topic, main_topic_link
        sub_ul = [child for child in each_li.iterchildren('ul')]  # get its child ul
        if len(sub_ul) != 0:
            sub_li = [child for child in sub_ul[0].iterchildren('li')]  # get its children li
            for each_sub_li in sub_li:
                sectionHTML = etree.tostring(each_sub_li, pretty_print=True)
                tree = etree.parse(StringIO(sectionHTML), self.parser)
                sub_topic = ' '.join(tree.xpath("//text()")).encode('utf-8').strip()
                sub_topic_link = tree.xpath("//@href")[0].encode('utf-8').strip()
                topics = {'main_topic': main_topic,
                          'main_topic_link': self.url + main_topic_link,
                          'sub_topic': sub_topic,
                          'sub_topic_link': self.url + sub_topic_link}
                self.get_all_functions(topics['sub_topic_link'], topics)
        else:
            topics = {'main_topic': main_topic,
                      'main_topic_link': self.url + main_topic_link,
                      'sub_topic': '',
                      'sub_topic_link': ''}
            self.get_all_functions(topics['main_topic_link'], topics)
def save_model(self, request, obj, form, change):
    if obj and form.is_valid():
        toc = None
        excerpt = None
        if 'original_file' in form.changed_data:
            if obj.html_file:
                obj.html_file.delete(save=False)
            f = request.FILES['original_file']
            html = _original_file_to_html(f)
            obj.html_file.save(obj.title + '.html', html, save=False)
            obj.html_file.close()
            html.seek(0)
            htmltree = lxml.html.fromstring(html.read().decode('utf-8'))
            toc = get_html_toc(htmltree)
            excerpt = get_html_excerpt(htmltree)
            f.close()
        obj.save(toc, excerpt)
def parse_web_page(url, xpaths=None, links=False):
    """Parse a response returned by a URL.

    The response can be parsed on the basis of xpaths determined by the URL's
    Resource instance or the xpaths given. If the response is to be parsed
    based on the former, the xpaths can be normal or related to link
    extraction, and thus patch-finding/recursion.

    Args:
        url (str): The URL to be parsed.
        xpaths (list[str]): A list of xpaths to parse the response with
            respect to. Defaults to None. If None, the xpaths are taken from
            the URL's corresponding Resource instance.
        links (bool): If True, the links xpaths are used from the
            corresponding Resource, else the normal xpaths are. Defaults to
            False.

    Returns:
        list[str]: A list of strings scraped from the determined or given
            xpaths.

    Raises:
        Exception: If there is an error in opening the given URL.
    """
    logger.info("Opening %s...", url)
    try:
        html = urllib.request.urlopen(url)
    except urllib.error.HTTPError:
        raise Exception("Error opening {url}".format(url=url))
    logger.info("Crawled %s", url)
    search_results = []
    if not xpaths:
        if not links:
            xpaths = Resource.get_resource(url).normal_xpaths
        else:
            xpaths = Resource.get_resource(url).links_xpaths
    elements = lxml.html.fromstring(html.read())
    for element in elements:
        if element.tag != "body":
            continue
        for xpath in xpaths:
            search_results.extend(element.xpath(xpath))
        break
    return search_results
def __init__(self, url): print("load codeforces problem %s" % url) html = request.urlopen(url) self.problem_id = CFProblem.get_problem_id(url) self.pdf_name = 'CF' + self.problem_id + '.pdf' self.dom = lxml.html.fromstring(html.read()) self.contest_name = self.dom.xpath('//*[@id="sidebar"]/div[1]/table/tbody/tr[1]/th/a')[0].text base_tag = lxml.html.Element('base', href="https://%s" % urlparse(url).netloc) style_tag = lxml.html.Element('style') style_tag.text = '#pageContent>*:not(.problemindexholder) { display: none !important; } #header { display: none; } #footer { display: none; } .roundbox.menu-box { display: none; } #sidebar { display: none; } #body > br:nth-child(8) { display: none; } #pageContent { margin-right: 0 !important; } #body { padding-top: 0; } #MathJax_Message { display: none !important; }' self.dom.xpath('//html')[0].insert(0, base_tag) self.dom.xpath('//head')[0].append(style_tag) contest_tag = lxml.html.Element('div') contest_tag.text = self.contest_name #contest_tag.attrib['class'] = 'title' contest_tag.attrib['style'] = 'text-align: left;' self.dom.xpath('//*[@class="header"]')[0].insert(0, contest_tag)
def get_prefectures(self):
    # STEP 1: fetch the list of prefectures
    try:
        html = urlopen(URL_TOP)
        soup = BeautifulSoup(html.read(), "lxml")
        links = soup.select("table tr td a")
        for link in links:
            exclusion = str(link).count('HOME') or str(link).count(
                '都道府県') or str(link).count('メール送信')
            if exclusion:
                continue
            href = link.get('href')
            self.pref_list.append({'url': href, 'name': link.text})
    except Exception as e:
        print('-----page not found.-----')
        print(e)
        self.pref_list = None
def _load_html(self, html, parser=lxml.html.parse):
    self.form_files = {}
    if hasattr(html, 'seek'):
        html.seek(0)
    if isinstance(html, (unicode, str)):
        html = StringIO(html)
    if isinstance(html, requests.Response):
        html = StringIO(html.content)
    if len(html.read()) == 0:
        self.document = None
        return None
    else:
        html.seek(0)
        self.document = parser(html)
        return html
def main():
    html = urllib.urlopen(url)
    doc = lxml.html.document_fromstring(html.read().decode('utf-8', 'ignore'))
    post = doc.cssselect('div.main .person-appointment-title')[0]
    print post.text  # position
    post1 = urllib.urlencode(post)
    #print p
    academictitle = doc.cssselect('div.main .person-appointment-title')[1]
    print academictitle.text  # academic title
    academictitle1 = urllib.urlencode(academictitle)
    fio = doc.cssselect('div.footer__breadcrumbs .b ')[0]  # full name
    print fio.text  # full name
    fio1 = urllib.urlencode(fio)
    items = doc.cssselect('div.g-pic')
    for item in items:
        image = item.get('style')
        #print image
        s = image.split("'")
        #print s[1]
        page = 'hse.ru' + s[1]  # address of the page that holds the photo
        print page  # hse.ru/pubs/share/direct/138568616
    # dictionary to be converted to JSON
    dictionary = {'post': post1, 'academic title': academictitle1, 'fio': fio1, 'photo': page}
    print dictionary
    #print dictionary {'academic title': 'class=person-appointment-title', 'post': 'class=person-appointment-title', 'fio': 'class=b', 'photo': 'hse.ru/pubs/share/direct/138568616'}
    #print(json.dumps((d), sort_keys=True))
    json_data = json.dumps(dictionary)
    print (json.dumps(dictionary, sort_keys=True, indent=4, separators=(',', ': ')))
    # Result of printing the dictionary as JSON data:
    # {
    #     "academic title": "class=person-appointment-title",
    #     "fio": "class=b",
    #     "photo": "hse.ru/pubs/share/direct/138568616",
    #     "post": "class=person-appointment-title"
    # }
    # For some reason the printed values are not the field values themselves
    # (e.g. for the full name) but only the name of the class the field lives in.
    # The same happens with all the fields we need.
    elements_json = json.loads(json_data)
    print elements_json["post"]  # access by key  -> class=person-appointment-title
    return json_data
def get_cities(self):
    # STEP 2: fetch the list of cities
    if self.pref_list is None:
        return
    for pref in self.pref_list:
        target_url = URL_TOP + pref['url']
        try:
            df = pd.DataFrame(columns=df_columns)
            html = urlopen(target_url)
            soup = BeautifulSoup(html.read(), "lxml")
            links = soup.select("center table tr td a")
            for link in links:
                if str(link).count('☆'):
                    continue
                href = link.get('href')
                arr = href.split("//")
                domain = arr[1]
                domain = domain[:-1]
                data = {
                    "pref": pref['name'],
                    "name": link.text,
                    "top_url": href,
                    'domain': domain
                }
                df = df.append(data, ignore_index=True)
                print(data)
            self.pref_df = pd.concat([self.pref_df, df])
        except Exception as e:
            print('-----page not found.-----')
            print(e)
def __get_flat_details__(self, link, flat_params):
    """
    Takes the URL of a page with information about a flat.
    Returns a dict mapping parameter name (as shown on the page) -> value.
    """
    url = settings.SITE_ROOT + link
    html = self.__get_url__(url)
    xhtml = lxml.html.fromstring(html.read())
    cells = xhtml.xpath(settings.DETAIL_CELLS_XPATH)
    result = dict()
    result[u"URL"] = url
    for i in range(len(cells) / 2):
        value = cells.pop().text_content()
        name = cells.pop().text_content()
        name = re.sub(":", "", name)
        name = name.strip()
        if name in flat_params:
            result[name] = value
    return result
def __init__(self, url):
    self.url = url
    html = urllib2.urlopen(url)
    self.source = lxml.html.fromstring(html.read())
def accident_records():
    print "reached"
    all_accidents = []
    for file_name in range(29):
        file_name = APP_ROOT + "/accidentApp" + "/try/" + str(file_name) + ".html"
        print file_name
        try:
            html = urllib.urlopen(file_name)
        except:
            continue
        html = html.read()
        i = 1
        while True:
            if i == 1:
                my_iter = 1
                my_iter2 = 3
            else:
                my_iter = 0
                my_iter2 = 0
            root1 = lxml.html.fromstring(html)
            try:
                main_content = root1.cssselect('div#pf' + str(i))
                i += 1
            except:
                break
            print main_content
            if main_content == []:
                break
            node = main_content[0]
            try:
                content_date = node.cssselect('div.x4')[my_iter:]
                content_time = node.cssselect('div.x4 div.t')[my_iter:]
                content_location = node.cssselect('div.x4 div.t')[my_iter:]
                death_1 = node.cssselect('div.x12')[my_iter2:]
                death_2 = node.cssselect('div.x1d')[my_iter:]
                death_3 = node.cssselect('div.x1e')[my_iter:]
                death_4 = node.cssselect('div.x1f')[my_iter:]
                injury_1 = node.cssselect('div.x13')[my_iter2:]
                injury_2 = node.cssselect('div.x20')[my_iter:]
                injury_3 = node.cssselect('div.x21')[my_iter:]
                injury_4 = node.cssselect('div.x22')[my_iter:]
                injury2_1 = node.cssselect('div.x14')[my_iter2:]
                injury2_2 = node.cssselect('div.x23')[my_iter:]
                injury2_3 = node.cssselect('div.x24')[my_iter:]
                injury2_4 = node.cssselect('div.x25')[my_iter:]
                vehicle_1 = node.cssselect('div.x15')
                vehicle_2 = node.cssselect('div.x26')
                vehicle_3 = node.cssselect('div.x27')
                vehicle_4 = node.cssselect('div.x28')
                vehicle_5 = node.cssselect('div.x29')
                vehicle_6 = node.cssselect('div.x2a')
                vehicle_7 = node.cssselect('div.x2b')
                vehicle_8 = node.cssselect('div.x2c')
                vehicle_damaged = node.cssselect('div.x18')[1:]
                rows = zip(content_date, content_time, content_location,
                           death_1, death_2, death_3, death_4,
                           injury_1, injury_2, injury_3, injury_4,
                           injury2_1, injury2_2, injury2_3, injury2_4,
                           vehicle_1, vehicle_2, vehicle_3, vehicle_4,
                           vehicle_5, vehicle_6, vehicle_7, vehicle_8,
                           vehicle_damaged)
            except:
                pass
            for item in rows:
                try:
                    print "------------------------------"
                    accident = {}
                    my_date = map_number(item[0].cssselect("div.t")[0].text_content().split()[0])
                    print my_date
                    accident["year"] = my_date.split(".")[0]
                    accident["month"] = my_date.split(".")[1]
                    accident["day"] = my_date.split(".")[2]
                    time = map_number(item[0].cssselect("div.t")[1].text_content().split()[0])
                    accident["hour"] = time.split(":")[0]
                    accident["minute"] = time.split(":")[1]
                    accident["location"] = item[0].cssselect("div.t")[2].text_content().strip()
                    death = 0
                    for each_death in item[3:7]:
                        death += int(each_death.text_content().strip() or 0)
                    injury = 0
                    for each_injury in item[7:15]:
                        injury += int(each_injury.text_content().strip() or 0)
                    accident["death"] = death
                    accident["injury"] = injury
                    accident["vehicle_damaged"] = int(item[-1].text_content().strip() or 0)
                    all_accidents.append(accident)
                    #print all_accidents
                except:
                    pass
    print all_accidents
    return all_accidents
def get_function_details(self, func_details, topics):
    html = urllib.urlopen(func_details['function_link'])
    html = html.read()
    self.parser = etree.HTMLParser()
    maintree = etree.parse(StringIO(html), self.parser)
    mainContent1 = maintree.xpath("//dl[@class='method']")    # scrape main div containing data
    mainContent2 = maintree.xpath("//dl[@class='function']")  # scrape main div containing data
    if len(mainContent1) == 0 and len(mainContent2) != 0:
        mainContent = mainContent2
    elif len(mainContent2) == 0 and len(mainContent1) != 0:
        mainContent = mainContent1
    elif len(mainContent1) == 0 and len(mainContent2) == 0:
        return
    argument_list = [child for child in mainContent[0].iterchildren('dt')]  # get its child dt
    contentHTML = etree.tostring(argument_list[0], pretty_print=True)
    tree = etree.parse(StringIO(contentHTML), self.parser)
    argument_list = tree.xpath("//text()")
    argument_list = ''.join(argument_list[1:len(argument_list) - 1]).encode('utf-8').strip()
    # getting details for each arg
    split_data = argument_list.split('(')
    full_function_name = split_data[0]
    sec_split_data = split_data[1].split(')')
    args = sec_split_data[:-1]
    arg_dict = {}
    if len(args) != 0:
        args = args[0].split(',')
        for each_arg in args:
            each_split = each_arg.split('=')
            if len(each_split) == 1:
                if each_arg.find('.') == -1:
                    arg_dict[each_arg] = {'optional_flag': 0, 'default_value': ''}
            else:
                if each_split[0].find('.') == -1:
                    arg_dict[each_split[0]] = {'optional_flag': 1, 'default_value': each_split[1]}
    # parsing examples
    examples = ''
    dd = [child for child in mainContent[0].iterchildren('dd')]  # get its child dd
    example_div = [child for child in dd[0].iterchildren('div')]  # get its child div
    if len(example_div) != 0:
        contentHTML = etree.tostring(example_div[0], pretty_print=True)
        tree = etree.parse(StringIO(contentHTML), self.parser)
        example_div_class = tree.xpath("//@class")
        if example_div_class[0] == 'highlight-python':
            examples = tree.xpath("//text()")
            examples = ''.join(examples)
    parameters_table = [child for child in mainContent[0].iterdescendants('table')]  # get its child table
    if len(parameters_table) != 0:
        contentHTML = etree.tostring(parameters_table[0], pretty_print=True)
        tree = etree.parse(StringIO(contentHTML), self.parser)
        table_class = tree.xpath("//@class")
        if table_class[0] == 'docutils field-list':
            all_desc = [child for child in parameters_table[0].iterdescendants('tr')]  # get its child tr
            # for parameters
            argument_desc = [child for child in all_desc[0].iterchildren('td')]  # get its child td
            contentHTML = etree.tostring(argument_desc[0], pretty_print=True)
            tree = etree.parse(StringIO(contentHTML), self.parser)
            argument_desc_list = tree.xpath("//text()")
            para_arg = {}
            para_arg['argument_desc'] = ''.join(argument_desc_list).encode('utf-8').strip()
            # for returns
            if len(all_desc) == 2:
                parameter_desc = [child for child in all_desc[1].iterchildren('td')]  # get its child td
                contentHTML = etree.tostring(parameter_desc[0], pretty_print=True)
                tree = etree.parse(StringIO(contentHTML), self.parser)
                parameter_desc_list = tree.xpath("//text()")
                para_arg['parameter_desc'] = ''.join(parameter_desc_list).encode('utf-8').strip()
            para_arg['parameter_desc'] = para_arg.get('parameter_desc') if para_arg.get('parameter_desc') != None else ''
            # final_data = {'function_name': func_details['function_name'],
            final_data = {'function_name': full_function_name,
                          'function_link': func_details['function_link'],
                          'function_description': func_details['function_desc'],
                          'argument_list': arg_dict,
                          'argument_description': para_arg['argument_desc'],
                          'return_parameter': para_arg['parameter_desc'],
                          'examples': examples,
                          'sub_topic': topics['sub_topic'],
                          'sub_topic_link': topics['sub_topic_link'],
                          'main_topic': topics['main_topic'],
                          'main_topic_link': topics['main_topic_link']}
            # write to mongodb
            self.mongo_obj.write_data(self.table_name, final_data)
    else:
        final_data = {'function_name': full_function_name,
                      'function_link': func_details['function_link'],
                      'function_description': func_details['function_desc'],
                      'argument_list': arg_dict,
                      'argument_description': '',
                      'return_parameter': '',
                      'examples': examples,
                      'sub_topic': topics['sub_topic'],
                      'sub_topic_link': topics['sub_topic_link'],
                      'main_topic': topics['main_topic'],
                      'main_topic_link': topics['main_topic_link']}
        self.mongo_obj.write_data(self.table_name, final_data)
import time
import sys
import codecs
import lxml.html
import urllib2

query = 'http://www39.atwiki.jp/osakahennyu/?cmd=backup&action=source&pageid=<PLACEHOLDER>&num=0'

for line in open(sys.argv[1], 'r'):
    url = query.replace('<PLACEHOLDER>', line.rstrip())
    while True:
        try:
            html = urllib2.urlopen(url)
            code = unicode(html.read(), 'utf-8')
            dom = lxml.html.fromstring(code)
            wiki = dom.xpath('//pre')[0]
            fout = codecs.open(line.rstrip() + '.txt', 'w', 'utf-8')
            fout.write(wiki.text)
            fout.close()
            html.close()
            break
        except urllib2.HTTPError:
            raw_input('>>> error! press continue...')
    time.sleep(1)
import re
import xml.etree.ElementTree as ET
# Blank Python
#import json  # for json decoding
from lxml import etree
from cStringIO import StringIO
import urllib

totalLinks = []
for i in range(21)[1:]:
    strAddr = "http://codingtrying.herobo.com/" + str(i) + ".html"
    html = urllib.urlopen(strAddr)
    html = html.read()
    parser = etree.HTMLParser()
    tree = etree.parse(StringIO(html), parser)
    mainContent = tree.xpath("//th[@class='rowA']/a/@href")
    for content in mainContent:
        if content != "http://www.dlapiper.com/us/people/#":
            totalLinks.append(content)

i = 0
for url in totalLinks:
    if i <= 481:
        i = i + 1
        continue
    try:
        page = scraperwiki.scrape(url)
        html = bs.BeautifulSoup(page)
def scrape_section(self, element, topics, scipy_first=False, all_info=None):
    if scipy_first:
        h1_topic = [child for child in element.iterchildren('h1')]  # get its child h1
        actual_link = [child for child in h1_topic[0].iterchildren('a')]  # get its child a
        if len(actual_link) == 2:
            contentHTML = etree.tostring(actual_link[0], pretty_print=True)
            tree = etree.parse(StringIO(contentHTML), self.parser)
            actual_link = tree.xpath("//@href")[0].split('/')
            if actual_link[0] == '..':
                html = urllib.urlopen(self.url + actual_link[1])
                html = html.read()
                maintree = etree.parse(StringIO(html), self.parser)
                mainContent = maintree.xpath("//div[@class='section']")  # scrape main div containing data
                self.scrape_section(mainContent[0], topics)
            else:
                return
    else:
        main_topics = [child for child in element.iterchildren('div')]  # get its child div
        for each_topic in main_topics:
            contentHTML = etree.tostring(each_topic, pretty_print=True)
            tree = etree.parse(StringIO(contentHTML), self.parser)
            div_class = tree.xpath("//@class")
            if div_class[0] == 'section':
                title = [child for child in each_topic.iterchildren('h2')]  # get its child h2
                mini_title, information = '', ''
                if len(title) == 0:
                    title = [child for child in each_topic.iterchildren('h3')]  # get its child h3
                if len(title) != 0:
                    titleHTML = etree.tostring(title[0], pretty_print=True)
                    title_tree = etree.parse(StringIO(titleHTML), self.parser)
                    mini_title = title_tree.xpath("//text()")[0].encode('utf-8').strip()
                if self.url == 'http://docs.scipy.org/doc/numpy/user/':
                    info = [child for child in each_topic.iterchildren('p')]  # get its child para
                    if len(info) != 0:
                        infoHTML = etree.tostring(info[0], pretty_print=True)
                        info_tree = etree.parse(StringIO(infoHTML), self.parser)
                        information = info_tree.xpath("//text()")[0].encode('utf-8').strip()
                    if all_info != None:
                        info_details = {'mini_title': mini_title,
                                        'mini_info': information,
                                        'parent_title': all_info.get('mini_title'),
                                        'parent_info': all_info.get('mini_info')}
                    else:
                        info_details = {'mini_title': mini_title, 'mini_info': information}
                else:
                    info_details = {'mini_title': mini_title, 'mini_info': information}
                self.scrape_section(each_topic, topics, all_info=info_details)
            else:
                self.get_func_tables(each_topic, topics)  # check if table of functions exists
                # check if there is a section div within the div
                self.scrape_section(each_topic, topics)
        else:
            if self.url == 'http://docs.scipy.org/doc/numpy/user/' and all_info != None:
                final_data = {'sub_topic': topics['sub_topic'],
                              'sub_topic_link': topics['sub_topic_link'],
                              'main_topic': topics['main_topic'],
                              'main_topic_link': topics['main_topic_link']}
                if all_info.get('parent_title') == None and all_info.get('parent_info') == None:
                    final_data['parent_title'] = all_info['mini_title']
                    final_data['parent_info'] = all_info['mini_info']
                    final_data['mini_title'] = ''
                    final_data['mini_info'] = ''
                    self.mongo_obj.write_data(self.table_name, final_data)
                else:
                    final_data['parent_title'] = all_info.get('parent_title')
                    final_data['parent_info'] = all_info.get('parent_info')
                    final_data['mini_title'] = all_info['mini_title']
                    final_data['mini_info'] = all_info['mini_info']
                    self.mongo_obj.write_data(self.table_name, final_data)
#tf.close()
###########################################################################
###### Instagram API ######################################################
client_id = '71b5f772fc5a467fbb4e6066ecbe9536'
access_token = "1451885321.71b5f77.ddac6f3e719c4afb8375ab1dda874fd9"
client_secret = "8fbf9fa995804da09d587ec0a3819e01"
api = InstagramAPI(access_token=access_token, client_secret=client_secret)
result = api.tag_recent_media(100, 10, moviename)
url = result[1]
html = urllib.urlopen(url)  # open the data URL with the urllib module
htmlread = html.read().decode('utf-8')  # read the response into htmlread
jjson = json.loads(htmlread)
data = jjson['data']
try:
    #saveFile = open('result.txt', 'a')
    for i in range(0, len(data)):
        a = data[i]
        tag = a['tags']
        for i in range(0, len(tag)):
            #saveFile.write(str(tag[i].encode('utf-8')) + " ")
            finaldata = finaldata + tag[i] + u" "
from BeautifulSoup import BeautifulSoup
import re
import urllib
import lxml.html
import string
import json
import pickle

for char in string.uppercase:
    movieInfoList = []
    html = urllib.urlopen('http://www.gomolo.com/indian-movies-list-films-database?SearchChar=' + char)
    soup = BeautifulSoup(html.read())
    #print soup.html.head.title.string
    items = soup.findAll("div", attrs={"id": "divMain"})[0].contents[0].contents
    movielinks = []
    for item in items:
        try:
            movielinks.append(item.contents[0].contents[0].attrs[0][1])
        except IndexError:
            print "IndexError"
            pass
    #movielinks = ['http://www.gomolo.com/bal-hanuman-2-movie/39179']
    for link in movielinks:
        movieInfo = {}
        arr = link.split("/")
#!/usr/bin/env python
import scraperwiki
import requests
import lxml.html
from bs4 import BeautifulSoup

html = open('index', 'r')
content = html.read()
print content
soup = BeautifulSoup(content)
print soup.prettify()
    while len(word) > index:
        val = word[index]
        index = index + 1
    else:
        val = "NULL"
    #print var + val + "\n"
    if val == None:
        return
    di[var] = val
    return val


#html = requests.get("http://www.autolanka.com/Buy.asp").content
html = open('index', 'r')
dom = lxml.html.fromstring(html.read())
varia = ["Code:", "Added:", "Make:", "Model:", "No:", "Year:",
         "Location:", "Options:", "Price:", "Info:"]
di = {}
ads = {}
for entry in dom.cssselect('.BuyDataTD'):
    [extract(var, entry, di) for var in varia]
    if len(di) == 10:
        print di
    #if (len(di)==10) and not(di['Code:'].replace("Code:","") in ads):
    #    ads[di['Code:'].replace("Code:","")]=di
#print ads
#! python3
import urllib.request
import lxml.html
import re

url = 'http://em.scnu.edu.cn/article/xueyuantongzhi/zonghe/'
html = urllib.request.urlopen(url)
scode = html.read().decode('utf-8')
doc = lxml.html.document_fromstring(scode)
ss = doc.xpath(
    """//div[@class="c_news"]/ul/li/a/font/text()|//div[@class="c_news"]/ul/li/a/text()""")
bb = doc.xpath("""//div[@class="c_news"]/ul/li/span/text()""")
aa = list(zip(ss, bb))
print(aa)
}
web = {}
web['新闻'] = 'https://searchcloudcomputing.techtarget.com.cn/news/'
for key in web:
    with open('D:/' + key + '.csv', 'w', newline='', encoding='utf-8-sig') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(('title', 'abstract', 'type', 'content'))
        for i in range(2, 407):
            try:
                print((key + '%.2f' % ((i - 1) / 407 * 100)) + "%")
                req = request.Request(
                    'https://searchcloudcomputing.techtarget.com.cn/interviews/page/3/',
                    headers=headers)
                html = urlopen(req)
                bsObj = BeautifulSoup(html.read(), "html.parser")
                print(bsObj.text)
                bs = bsObj.find_all('h4', attrs={'class': 'newslist'})
                print(bs)
                for j in bs:
                    req = request.Request(j.find('a').get('href'), headers=headers)
                    print(j.find('a').get('href'))
                    html = urlopen(req)
                    bsObj = BeautifulSoup(html.read(), "html.parser")
                    bs = bsObj.find_all(name='div', attrs={'class': 'newslist'})
                    content = ''
                    for i in bs:
                        content = f'{content}{i.text}'
                    title = bsObj.find_all('h1')
def getHtml(url):
    html = urllib2.urlopen(url)
    page = lxml.html.fromstring(html.read())
    html.close()
    return page