Beispiel #1
0
 def update(self):
     """Refresh the delay information of the known bus lines.

     Stamps ``self.lastUpdate``, calls ``reset()`` on every value of
     ``self.lines``, fetches ``self.url`` through ``self.downloader`` and
     walks the rows of the tables in the returned page.

     Per row the cells are read by position: cell 0 holds the scheduled
     time (``H:M``) plus the delay span, cell 1 the destination and cell 2
     the line number.  When ``linenumber + destination`` is a key of
     ``self.lines`` the parsed delay is stored on the object returned by
     that line's ``find(time)``.
     """
     self.lastUpdate = datetime.now()
     for bus in self.lines.values():
         bus.reset()
     html = self.downloader.get(self.url)
     for row in xpath.search(html, '//table//tr'):
         time = ''
         delay = ''
         destination = ''
         linenumber = ''
         for column, cell in enumerate(xpath.search(row, '/td')):
             if column == 0:
                 hour, minutes = xpath.get(cell, '/span').strip().split(':')
                 time = Time(hour, minutes)
                 delay_parts = xpath.get(
                     cell, '/span[@class="block exclamation bold mts"]'
                 ).strip().split(':')
                 if len(delay_parts) > 1:
                     delay = Time(delay_parts[0], delay_parts[1], "delay")
                 else:
                     # only one component: it is passed as the minutes
                     delay = Time(0, delay_parts[0], "delay")
             elif column == 1:
                 destination = cell.strip()
             elif column == 2:
                 linenumber = cell.strip()
         # NOTE(review): once a first cell was read ``delay`` is always a Time
         # object, so this guard presumably only skips rows without cells.
         if delay != '':
             key = linenumber + destination
             # ``in`` replaces dict.has_key(), which no longer exists in Python 3
             if key in self.lines:
                 self.lines[key].find(time).delay = delay
Beispiel #2
0
 def update(self):
     """Refresh the delay information of the known bus lines.

     Stamps ``self.lastUpdate``, calls ``reset()`` on every value of
     ``self.lines``, fetches ``self.url`` through ``self.downloader`` and
     walks the rows of the tables in the returned page.

     Per row the cells are read by position: cell 0 holds the scheduled
     time (``H:M``) plus the delay span, cell 1 the destination and cell 2
     the line number.  When ``linenumber + destination`` is a key of
     ``self.lines`` the parsed delay is stored on the object returned by
     that line's ``find(time)``.
     """
     self.lastUpdate = datetime.now()
     for bus in self.lines.values():
         bus.reset()
     html = self.downloader.get(self.url)
     for row in xpath.search(html, '//table//tr'):
         time = ''
         delay = ''
         destination = ''
         linenumber = ''
         for column, cell in enumerate(xpath.search(row, '/td')):
             if column == 0:
                 hour, minutes = xpath.get(cell, '/span').strip().split(':')
                 time = Time(hour, minutes)
                 delay_parts = xpath.get(
                     cell, '/span[@class="block exclamation bold mts"]'
                 ).strip().split(':')
                 if len(delay_parts) > 1:
                     delay = Time(delay_parts[0], delay_parts[1], "delay")
                 else:
                     # only one component: it is passed as the minutes
                     delay = Time(0, delay_parts[0], "delay")
             elif column == 1:
                 destination = cell.strip()
             elif column == 2:
                 linenumber = cell.strip()
         # NOTE(review): once a first cell was read ``delay`` is always a Time
         # object, so this guard presumably only skips rows without cells.
         if delay != '':
             key = linenumber + destination
             # ``in`` replaces dict.has_key(), which no longer exists in Python 3
             if key in self.lines:
                 self.lines[key].find(time).delay = delay
Beispiel #3
0
def holders_parse(html, i):
    """Render the rows of the ``table``-classed table as quoted CSV lines.

    Only rows that contain at least two adjacent ``<td>`` cells are kept;
    every cell is passed through ``common.normalize`` and the cells of a row
    are joined as ``"a","b",...``.

    :param html: page markup containing the table.
    :param i: not used by this function.
    :returns: list with one quoted line per kept row.
    """
    table = xpath.get(html, r'//table[@class="table"]', remove=None)
    rows = xpath.search(table, r'//tr', remove=None)
    return [
        '"' + '","'.join(
            [common.normalize(cell) for cell in xpath.search(row, r'//td')]
        ) + '"'
        for row in rows
        if '</td><td>' in row
    ]
Beispiel #4
0
def mal(mal_title, mal_id=False):
    """Search MyAnimeList for *mal_title* and wrap the result in a ``MalResult``.

    :param mal_title: title sent as the ``q`` parameter of the search API.
    :param mal_id: optional id; when given, the first ``<entry>`` whose
        markup contains it replaces the full response as the source of the
        fields.  If no entry matches, the full response is used.
    :returns: ``MalResult`` built from the fields read out of the response;
        ``status`` and the synopsis are passed through ``translate(..., 'es')``.
    :raises AssertionError: when no ``<id>`` could be read (message is
        *mal_title*).
    """
    # NOTE(review): session cookie and account credentials are hard-coded
    # here; they should come from configuration / a secret store.
    cookies = {"incap_ses_224_81958":"P6tYbUr7VH9V6shgudAbA1g5FVYAAAAAyt7eDF9npLc6I7roc0UIEQ=="}
    response = requests.get(
        "http://myanimelist.net/api/anime/search.xml",
        params={'q': mal_title},
        cookies=cookies,
        auth=("zodman1", "zxczxc"),
        headers={'User-Agent': 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36'})
    content = response.content
    if mal_id is not False:
        for entry in xpath.search(content, "//entry"):
            if mal_id in entry:
                content = entry
                break

    tqdm.write("%s %s" % ((mal_title,), mal_id))
    anime_id = xpath.get(content, "//id")
    title = xpath.get(content, "//title")
    title_en = xpath.get(content, "//english")
    type_ = xpath.get(content, "//type")
    synonyms = xpath.get(content, "//synonyms")
    status = xpath.get(content, "//status")
    synopsys = translate(xpath.get(content, "//synopsis"), "es")
    img = xpath.get(content, "//image")
    episodes = xpath.get(content, "//episodes")
    # NOTE(review): ``synopsys`` is already translated; the cleaned copy is
    # translated a second time below - confirm that this is intended.
    resumen = synopsys.replace("&lt;br /&gt;", " ").replace("\n\r", "")
    resumen = translate(resumen, 'es')
    status = translate(status, 'es')
    # compare by value: ``is not ""`` tests object identity, which only works
    # by accident of string interning
    assert anime_id != "", mal_title

    data = dict(title=title, title_en=title_en, type=type_, status=status,
                resumen=resumen, img=img, episodes=episodes,
                synonyms=synonyms, id=anime_id, synopsys=synopsys)
    return MalResult(**data)
def download_locations():
    """Write one ``<country>_locations.csv`` per GeoNames postal-code archive.

    Every ``.zip`` linked from the GeoNames export index (except the
    ``_full`` and ``allCountries`` archives) is downloaded, the contained
    ``.txt`` table is read, and for each distinct postal-code prefix (the
    part before the first ``-``) with non-zero coordinates one row
    ``place, zip_code, lat, lng`` is written to the CSV file.

    Archives that cannot be opened are reported, removed from the download
    cache and skipped.  Rows with unparsable coordinates are reported and
    skipped; rows with fewer than 11 columns are skipped silently.
    """
    D = download.Download(num_retries=1)
    index_url = 'http://download.geonames.org/export/zip/'
    index_html = D.get(index_url)
    for link in xpath.search(index_html, '//pre/a/@href'):
        if (not link.endswith('.zip') or '_full' in link
                or 'allCountries' in link):
            continue
        zip_url = urlparse.urljoin(index_url, link)
        input_zip = StringIO.StringIO()
        input_zip.write(D.get(zip_url))
        try:
            tsv_data = zipfile.ZipFile(input_zip).read(
                link.replace('.zip', '.txt'))
        except zipfile.BadZipfile as e:
            print(e)
            # presumably so that a later run downloads the archive again
            del D.cache[zip_url]
            continue
        output_filename = link.replace('.zip', '_locations.csv')
        found = set()
        # the context manager guarantees the CSV is flushed and closed even
        # when a row raises
        with open(output_filename, 'w') as output_file:
            writer = csv.writer(output_file)
            for row in csv.reader(tsv_data.splitlines(), delimiter='\t'):
                if len(row) < 11:
                    # blank or truncated line: columns 9/10 are missing
                    continue
                zip_code = row[1].split('-')[0]
                try:
                    lat, lng = float(row[9]), float(row[10])
                except ValueError:
                    print('bad coord: %s %s' % (row[9], row[10]))
                else:
                    if lat and lng and zip_code not in found:
                        found.add(zip_code)
                        writer.writerow([row[2], zip_code, lat, lng])
        print('Downloaded to %s' % output_filename)
Beispiel #6
0
def scrapeAmazon(gamename):
	"""Search Amazon's video game listing for *gamename*.

	Returns ``1`` (after printing a message) when the fetch yields nothing,
	otherwise a tuple ``(titles, links, prices)`` of xpath matches.
	"""
	downloader = download.Download(user_agent=None)
	# spaces become "+" so the name can be placed in the query string
	query = gamename.replace(" ", "+")
	url_template = "http://www.amazon.com/gp/search/ref=sr_il_ti_videogames?rh=n%3A468642%2Ck%3A{}&keywords={}&ie=UTF8&qid=1407988315&lo=videogames"

	page = downloader.fetch(url_template.format(query, query))
	if not page:
		print("Couldn't connect to Amazon's servers.")
		return 1

	titles = xpath.search(page, '//div[@class="ilt3"]//a//span[@class="lrg bold"]')
	links = xpath.search(page, '//div[@class="ilt3"]//a/@href')
	prices = xpath.search(page, '//div[@class="ill3"]//span[@class="red bld"]')
	return (titles, links, prices)
Beispiel #7
0
def process_page_bizhi(page):
    """Process every item linked from listing *page* and from the pages after it.

    Each link found in the ``plistbox`` container is handed to
    ``process_bizhi_url``; afterwards the "next" pagination link is followed
    until none is left.  Pagination is a loop rather than recursion, so long
    listings cannot run into the interpreter's recursion limit.

    A ``urllib2.URLError`` is logged together with the page being processed
    and ends the walk.

    :param page: absolute URL of the first listing page.
    """
    site = 'http://a.3533.com'
    while True:
        try:
            response = urllib2.urlopen(page)
            try:
                html = response.read()
            finally:
                response.close()

            app_list = xpath.search(html, '//div[@id="plistbox"]/span/a/@href')
            if not app_list:
                print("page format changed at: %s" % page)
                return
            for app in app_list:
                process_bizhi_url(site + app)

            page_next = xpath.search(html, '//div[@class="page"]/ul/li/a[@class="next"]/@href')
            if not page_next:
                print("reached at the max page or page format changed: %s" % page)
                return
            page = site + page_next[0]
        except urllib2.URLError as e:
            # log the exception itself: URLError keeps its cause in ``reason``
            # and leaves ``message`` empty (the attribute is gone in Python 3)
            logger.error('process_url URLError Exception at: %s, %s' % (page, e))
            return
Beispiel #8
0
def scrapeBB(gamename):
	"""Search Best Buy's video game category for *gamename*.

	Returns ``1`` (after printing a message) when the fetch yields nothing,
	otherwise a tuple ``(titles, links, prices)`` of xpath matches.
	"""
	downloader = download.Download(user_agent=None)
	# spaces become "+" so the name can be placed in the query string
	query = gamename.replace(" ", "+")
	url_template = "http://www.bestbuy.com/site/searchpage.jsp?_dyncharset=UTF-8&id=pcat17071&type=page&ks=960&st={}&sc=Global&cp=1&sp=&qp=category_facet%3DVideo+Games~abcat0700000&list=y&usc=All+Categories&nrp=15&iht=n&seeAll="

	page = downloader.fetch(url_template.format(query))
	if not page:
		print("Couldn't connect to Best Buy's servers.")
		return 1

	titles = xpath.search(page, '//h3[@itemprop="name"]//a')
	links = xpath.search(page, '//h3[@itemprop="name"]//a/@href')
	prices = xpath.search(page, '//span[@itemprop="price"]')
	return (titles, links, prices)
Beispiel #9
0
def scrapeGamestop(gamename):
	"""Search Gamestop's catalogue for *gamename*.

	Returns ``1`` (after printing a message) when the fetch yields nothing,
	otherwise a tuple ``(titles, links, prices)`` of xpath matches.
	"""
	downloader = download.Download()
	# spaces become "+" so the name can be placed in the URL
	query = gamename.replace(" ", "+")

	page = downloader.fetch("http://www.gamestop.com/browse?nav=16k-3-{},28zu0".format(query))
	if not page:
		print("Couldn't connect to Gamestop's servers.")
		return 1

	product_link = '//div[@class="product_info grid_12"]//a[1]'
	titles = xpath.search(page, product_link)
	links = xpath.search(page, product_link + '/@href')
	prices = xpath.search(page, '//p[@class="pricing"]')
	return (titles, links, prices)
Beispiel #10
0
def scrapeBaramDom():
    """Scrape the ads listed on the baramdom.com front page.

    For every ad block the detail page is downloaded as well; the collected
    fields are wrapped in ``Ad`` objects and returned through ``adsToJson``.
    """
    # UTF-8 support (Python 2: change the interpreter-wide default encoding)
    reload(sys)
    sys.setdefaultencoding('utf-8')
    now = datetime.now()
    site = "http://www.baramdom.com"
    front_page = unicode(Downloader('http://www.baramdom.com/').get_content())
    post_box = xpath.get(front_page, '//div[@class="box post"]')
    ads = []
    for post in xpath.search(post_box, '//div[@class="content"]'):
        link = site + xpath.get(post, '//div[@class="post-title"]/h2/a/@href')
        title = xpath.get(post, '//div[@class="post-title"]/h2/a')
        image_path = xpath.get(post, '//a[@class="grouped"]/img/@src')
        if image_path == "":
            # no picture on the ad: use the site's placeholder image
            imageUrl = "http://www.baramdom.com/img/apartment_noimage.png"
        else:
            imageUrl = site + image_path
        detail_page = unicode(Downloader(link).get_content())
        description = xpath.get(detail_page, '//p[@class="post_add_desc"]').strip()
        category = u"Недвижнини"
        # the ad subtitle is split on " во ": the second part is the region,
        # and the text after "ам " in the first part is the subcategory
        subtitle = xpath.get(post, '//p[@class="add-title"]').strip().split(" во ")
        region = subtitle[1]
        country = u"Македонија"
        subcategory = subtitle[0].split("ам ")[1]
        price = xpath.get(detail_page, '//div[@class="post-add"]/p[@class="last"]').strip().split(" ")
        if len(price) == 3:
            # three words: treated as "no price given"
            value = "/"
            currency = "/"
        else:
            value = price[0]
            currency = price[1]
            if currency == "Euro.":
                currency = "EUR"
            elif currency == u"Ден.":
                currency = "MKD"
        # take the text after the first ">", keep its first word and turn
        # the a-b-c date around into c-b-a
        stamp = xpath.get(post, '//div[@class="fl"]').strip().split(">")[1]
        stamp = stamp.strip().split(" ")[0].split("-")
        date = stamp[2] + "-" + stamp[1] + "-" + stamp[0]
        ads.append(Ad(link, title, imageUrl, description, category,
                      subcategory, value, currency, region, date, country))
    return adsToJson(ads)
#print scrapeBaramDom()
Beispiel #11
0
def scrapeOglasiRs():
    """Scrape the ads on the first oglasi.rs search result page.

    For every listed ad the detail page is downloaded as well; the collected
    fields are wrapped in ``Ad`` objects and returned through ``adsToJson``.
    """
    # UTF-8 support (Python 2: change the interpreter-wide default encoding)
    reload(sys)
    sys.setdefaultencoding('utf-8')
    now = datetime.now()
    listing = unicode(Downloader('http://www.oglasi.rs/pretraga/0/0/').get_content())

    ads = []
    for item in xpath.search(listing, '//li[@class="clearfix"]'):
        link = xpath.get(item, '//a[@class="ogl_id"]/@href')
        title = xpath.get(item, '//h2/a[@class="ogl_id"].text()')
        imageUrl = "http://oglasi.rs" + xpath.get(item, '//a[@class="ogl_id"]/img/@src')
        price = xpath.get(item, '//div[@class="ad-price"]/h3')
        # the listing date is a.b.c; it is stored as c-b-a
        stamp = xpath.get(item, '//div[@class="right-side"]/div/p/strong').split(".")
        date = stamp[2] + "-" + stamp[1] + "-" + stamp[0]
        # "<amount> <currency>": drop the dots from the amount and
        # everything from the first comma on
        price = price.split(" ")
        amount = price[0].replace(".", "")
        currency = price[1]
        value = amount.split(",")[0]
        detail_page = unicode(Downloader(link).get_content())
        # the second paragraph of the description block carries the text
        description = xpath.search(detail_page, '//div[@class="description"]/p')[1].strip()
        category = "/"
        subcategory = "/"
        facts = xpath.search(detail_page, '//div[@class="description"]/ul[@class="clearfix"]')
        # second list entry, cut off at the first "("
        region = xpath.search(facts[0], '//li')[1].split("(")[0].strip()
        country = u"Србија"
        ads.append(Ad(link, title, imageUrl, description, category,
                      subcategory, value, currency, region, date, country))
    return adsToJson(ads)

#print scrapeOglasiRs()
Beispiel #12
0
def scrapeBB(gamename):
    """Search Best Buy's video game category for *gamename*.

    Returns ``1`` (after printing a message) when the fetch yields nothing,
    otherwise a tuple ``(titles, links, prices)`` of xpath matches.
    """
    downloader = download.Download(user_agent=None)
    # spaces become "+" so the name can be placed in the query string
    query = gamename.replace(" ", "+")
    url_template = "http://www.bestbuy.com/site/searchpage.jsp?_dyncharset=UTF-8&id=pcat17071&type=page&ks=960&st={}&sc=Global&cp=1&sp=&qp=category_facet%3DVideo+Games~abcat0700000&list=y&usc=All+Categories&nrp=15&iht=n&seeAll="

    page = downloader.fetch(url_template.format(query))
    if not page:
        print("Couldn't connect to Best Buy's servers.")
        return 1

    titles = xpath.search(page, '//h3[@itemprop="name"]//a')
    links = xpath.search(page, '//h3[@itemprop="name"]//a/@href')
    prices = xpath.search(page, '//span[@itemprop="price"]')
    return (titles, links, prices)
def get_external_URL(page_html):
    """Return the distinct link targets found in the page's main container.

    Every occurrence of the module-level ``archive`` string is removed from
    each ``href`` before it is collected.

    :param page_html: markup of the page to inspect.
    :returns: set of the cleaned URLs.
    """
    hrefs = xpath.search(page_html,
                         '//section[@class="maincontainer"]//a/@href')
    # a set drops repeated URLs by itself, no membership test required
    return set(href.replace(archive, '') for href in hrefs)
Beispiel #14
0
def scrapeGamestop(gamename):
    """Search Gamestop's catalogue for *gamename*.

    Returns ``1`` (after printing a message) when the fetch yields nothing,
    otherwise a tuple ``(titles, links, prices)`` of xpath matches.
    """
    downloader = download.Download()
    # spaces become "+" so the name can be placed in the URL
    query = gamename.replace(" ", "+")

    page = downloader.fetch(
        "http://www.gamestop.com/browse?nav=16k-3-{},28zu0".format(query))
    if not page:
        print("Couldn't connect to Gamestop's servers.")
        return 1

    product_link = '//div[@class="product_info grid_12"]//a[1]'
    titles = xpath.search(page, product_link)
    links = xpath.search(page, product_link + '/@href')
    prices = xpath.search(page, '//p[@class="pricing"]')
    return (titles, links, prices)
def download_content(outputfile, seen_urls):
    """Append each URL and the text extracted from its page to *outputfile*.

    For every URL the page is fetched through the module-level downloader
    ``D``.  The URL is written on its own line, followed by the text nodes of
    the ``ba_content`` container; when that container yields nothing the
    ``mainpopup`` container is tried instead.

    :param outputfile: path of the file to append to.
    :param seen_urls: iterable of URLs to download.
    """
    # ``with`` closes the file even when a download or write raises
    with open(outputfile, 'a') as f:
        for url in seen_urls:
            page_html = D.get(url)
            contents = xpath.search(page_html,
                                    '//div[@id="ba_content"]//div/text()')
            f.write(url + '\n')
            # "nothing found" may come back as an empty result rather than
            # None, so test truthiness to make the fallback reachable
            if not contents:
                contents = xpath.search(page_html,
                                        '//div[@class="mainpopup"]//div/text()')
            for content in contents or []:
                f.write(content)
Beispiel #16
0
def scrapeAmazon(gamename):
    """Search Amazon's video game listing for *gamename*.

    Returns ``1`` (after printing a message) when the fetch yields nothing,
    otherwise a tuple ``(titles, links, prices)`` of xpath matches.
    """
    downloader = download.Download(user_agent=None)
    # spaces become "+" so the name can be placed in the query string
    query = gamename.replace(" ", "+")
    url_template = "http://www.amazon.com/gp/search/ref=sr_il_ti_videogames?rh=n%3A468642%2Ck%3A{}&keywords={}&ie=UTF8&qid=1407988315&lo=videogames"

    page = downloader.fetch(url_template.format(query, query))
    if not page:
        print("Couldn't connect to Amazon's servers.")
        return 1

    titles = xpath.search(
        page, '//div[@class="ilt3"]//a//span[@class="lrg bold"]')
    links = xpath.search(page, '//div[@class="ilt3"]//a/@href')
    prices = xpath.search(
        page, '//div[@class="ill3"]//span[@class="red bld"]')
    return (titles, links, prices)
Beispiel #17
0
def process_ruanjian_url(url):
    """Collect the metadata of one software page and queue its download.

    Reads the app name, version and size from the page at *url* and resolves
    the download link through ``util.RedirectHandler``.  When a download link
    exists, the record is printed, passed to ``util.sql_do`` and
    ``util.put_job``, and the module-level counter ``cnt_all`` is incremented.

    A ``urllib2.URLError`` is logged and swallowed.

    :param url: absolute URL of the software page,
        e.g. ``http://a.3533.com/ruanjian/4180.htm``.
    """
    global cnt_all
    try:
        print(url)
        response = urllib2.urlopen(url)
        try:
            data = response.read()
        finally:
            response.close()
        down_obj = util.app_info({'market': market})
        down_obj['app_url'] = url
        down_obj['app_url_md5'] = hashlib.md5(url).hexdigest()

        app_name = xpath.search(data, '//div[@class="gametit"]/h1/')
        if app_name:
            down_obj['app_name'] = app_name[0]

        apk_left = xpath.search(data, '//div[@class="apkleft"]/ul/li')
        if apk_left:
            version = re.search(r'([.\d]+)', apk_left[0])
            if version:
                down_obj['version'] = version.group(1)

            # the size is read from the fifth list entry; skip it when the
            # page has fewer entries instead of failing with IndexError
            size = re.search(r'([.\d]+)([MK])', apk_left[4]) if len(apk_left) > 4 else None
            if size:
                factor = 1024 * 1024 if size.group(2) == 'M' else 1024
                down_obj['size'] = int(float(size.group(1)) * factor)

        short_url = xpath.search(data, '//div[@class="apkdown"]/a/@href')
        if short_url:
            opener = urllib2.build_opener(util.RedirectHandler)
            down_obj['download_link'] = opener.open(short_url[0]).geturl()

            print(down_obj)
            util.sql_do(down_obj)
            util.put_job(down_obj)
            cnt_all += 1
    except urllib2.URLError as e:
        # log the exception itself: URLError keeps its cause in ``reason``
        # and leaves ``message`` empty (the attribute is gone in Python 3)
        logger.error('process_url Exception at: %s, %s' % (url, e))
Beispiel #18
0
def scrapeNedviznostiMakedonija():
    """Scrape the ads on the nedviznostimakedonija.com.mk search page.

    For every result box the detail page is downloaded as well; the collected
    fields are wrapped in ``Ad`` objects and returned through ``adsToJson``.
    """
    # UTF-8 support (Python 2: change the interpreter-wide default encoding)
    reload(sys)
    sys.setdefaultencoding('utf-8')
    now = datetime.now()
    site = "http://www.nedviznostimakedonija.com.mk/"
    listing = unicode(Downloader(
        'http://www.nedviznostimakedonija.com.mk/Default.aspx?search=1'
    ).get_content())
    ads = []
    for box in xpath.search(listing, '//div[@class="boxesResultNewTop"]'):
        link = site + xpath.get(
            box, '//a[@class="subjectLook nobackim"]/@href')
        title = xpath.get(box, '//a[@class="subjectLook nobackim"]').strip()
        imageUrl = site + xpath.get(box, '//a[@class="nobackim"]/img/@src')
        detail_page = unicode(Downloader(link).get_content())
        description = xpath.get(
            detail_page,
            '//span[@id="Body1_DetailControl1_FormView1_Label5"]')
        category = u"Недвижнини"
        subcategory = "/"
        # "<amount> <currency>": the amount loses its dots, the currency is
        # EUR for the euro entity and MKD for anything else
        price = xpath.get(
            box,
            '//div[@style="float:right; color:#1b5474; font-size:14px; font-weight:bold;"]/span'
        ).split(" ")
        value = price[0].replace(".", "")
        currency = "EUR" if price[1] == "&#8364;" else "MKD"
        region = xpath.get(
            detail_page,
            '//span[@id="Body1_DetailControl1_FormView1_cityDescriptionLabel"]'
        )
        country = u"Македонија"
        # the page shows a.b.c; it is stored as c-b-a
        stamp = xpath.get(
            detail_page,
            '//span[@id="Body1_DetailControl1_FormView1_LabelDate"]'
        ).split(".")
        date = stamp[2] + "-" + stamp[1] + "-" + stamp[0]
        ads.append(Ad(link, title, imageUrl, description, category,
                      subcategory, value, currency, region, date, country))
    return adsToJson(ads)


#print scrapeNedviznostiMakedonija()
Beispiel #19
0
def scrapeVipMarket5():
    """Scrape the ads on the vipmarket5.mk search page.

    For every result row the detail page is downloaded as well; the collected
    fields are wrapped in ``Ad`` objects and returned through ``adsToJson``.
    Fields that are empty on the site are reported as ``"/"``.
    """
    # UTF-8 support (Python 2: change the interpreter-wide default encoding)
    reload(sys)
    sys.setdefaultencoding('utf-8')
    down = Downloader('http://www.vipmarket5.mk/search/')
    content = down.get_content()
    html = unicode(content)
    linkovi = xpath.search(html, '//tr[@class="frame_content"]')
    ads = []
    for l in linkovi:
        link = "http://www.vipmarket5.mk" + xpath.get(l, '//div[@style="width:365px; height:90%; margin-top:10px;"]/b/a/@href')
        title = xpath.get(l, '//div[@style="width:365px; height:90%; margin-top:10px;"]/b/a')
        imageUrl = xpath.get(l, '//div[@style="overflow:hidden; width:150px; height: 146px; margin: 5px;"]/a/img/@src')
        download = Downloader(link)
        cont = unicode(download.get_content())
        description = xpath.get(cont, '//div[@class="feature"]/p').strip()
        if description == "":
            description = "/"

        # NOTE: the site has no categories
        category = "/"
        subcategory = "/"
        price = xpath.get(l, '//div[@style="margin-top:5px; margin-left:10px;height:155px; overflow:hidden;"]/h4/a')
        if price == u"Цена:По договор":
            # price by agreement: no amount available
            value = "/"
            currency = "/"
        else:
            # "<label>:<amount> <unit>"
            price = price.split(":")[1].split(" ")
            value = price[0]
            if price[1] == "&euro;":
                currency = "EUR"
            elif price[1] == "ден.":
                currency = "MKD"
            else:
                # unknown unit: without this branch ``currency`` would be
                # unbound or silently carried over from the previous ad
                currency = "/"
        # text after ": " is a.b.c; it is stored as c-b-a
        date = xpath.get(l, '//b[@style="font-weight:bold;"]')
        date = date.split(": ")[1].split(".")
        date = date[2] + "-" + date[1] + "-" + date[0]
        country = u"Македонија"

        region = xpath.get(cont, '//div[@style="float:left; width: 140px; overflow:hidden; font-family: Tahoma,Geneva,sans-serif; font-weight:bold"]')
        if region == "":
            region = "/"

        ad = Ad(link, title, imageUrl, description, category, subcategory, value, currency, region, date, country)
        ads.append(ad)
    return adsToJson(ads)
#print scrapeVipMarket5()
Beispiel #20
0
def parse(html, page):
    """Convert a transaction table into quoted, comma-separated lines.

    The markup is cut at every ``<td></td></tr>``; from each piece the first
    three ``_parent`` links (hash, from, to), the tooltip title (age) and the
    trailing numeric cell (quantity) are joined as ``"a","b",...``.

    :param html: markup of the table.
    :param page: not used by this function.
    :returns: list with one quoted line per piece.
    """
    # the result list is local; the original appended to a name that was
    # never bound inside the function
    infos = []
    for i in html.split("<td></td></tr>"):
        ms = xpath.search(i, r"//a[@target='_parent']")
        txhash = ms[0] if len(ms) > 0 else ''
        fm = ms[1] if len(ms) > 1 else ''
        too = ms[2] if len(ms) > 2 else ''

        age = xpath.get(i, r"//span[@rel='tooltip']/@title")
        quantity = common.regex_get(i, r'>([\d\.\,]+)</td>$')

        info = '"' + '","'.join([txhash, age, fm, too, quantity]) + '"'
        infos.append(info)
    return infos
Beispiel #21
0
def process_bizhi_url(url):
    """Collect the metadata of one wallpaper page and queue its download.

    Reads the name and size from the page at *url* and resolves the download
    link through ``util.RedirectHandler``.  When a download link exists, the
    record is printed, passed to ``util.sql_do`` and ``util.put_job``, and
    the module-level counter ``cnt_all`` is incremented.

    A ``urllib2.URLError`` is logged and swallowed.

    :param url: absolute URL of the wallpaper page.
    """
    global cnt_all
    try:
        response = urllib2.urlopen(url)
        try:
            data = response.read()
        finally:
            response.close()
        down_obj = util.app_info({'market': market})
        down_obj['app_url'] = url
        down_obj['app_url_md5'] = hashlib.md5(url).hexdigest()

        app_name = xpath.search(data, '//div[@class="viewh"]/h1/')
        if app_name:
            down_obj['app_name'] = app_name[0]

        infoleft = xpath.search(data, '//ul[@class="infoleft"]/li')
        # the size is read from the second list entry; skip it when the page
        # has fewer entries instead of failing with IndexError
        if len(infoleft) > 1:
            found = re.search(r'([.\d]+)([MK])', infoleft[1])
            if found:
                # "M" is megabytes, anything else ("K") kilobytes
                factor = 1024 * 1024 if found.group(2) == 'M' else 1024
                down_obj['size'] = int(float(found.group(1)) * factor)

        short_url = xpath.search(data, '//div[@class="inforight"]/a/@href')
        if short_url:
            opener = urllib2.build_opener(util.RedirectHandler)
            down_obj['download_link'] = opener.open(short_url[0]).geturl()

            print(down_obj)
            util.sql_do(down_obj)
            util.put_job(down_obj)
            cnt_all += 1

    except urllib2.URLError as e:
        # log the exception itself: URLError keeps its cause in ``reason``
        # and leaves ``message`` empty (the attribute is gone in Python 3)
        logger.error('process_url Exception at: %s, %s' % (url, e))
Beispiel #22
0
    def parse_html2(html):
        """Convert a transfer table into quoted, comma-separated lines.

        The markup is cut at every ``<td></td></tr>``; pieces without an
        ``address-tag`` span are dropped.  For the others the line is
        ``"hash","age","from","direction","to","quantity"``.
        """
        lines = []
        for chunk in html.split("<td></td></tr>"):
            tags = xpath.search(chunk, r"//span[@class='address-tag']")[:3]
            # normalise the tags that exist and pad the rest with ''
            fields = [common.normalize(tag) for tag in tags]
            fields += [''] * (3 - len(fields))
            txhash, fm, too = fields

            age = xpath.get(chunk, r"//span[@rel='tooltip']/@title")
            quantity = common.regex_get(chunk, r'>([\d\.\,]+)</td>$')
            direction = common.normalize(xpath.get(chunk, r'//span[@class="label\slabel.+"]'))

            if not txhash:
                continue
            lines.append('"' + '","'.join([txhash, age, fm, direction, too, quantity]) + '"')
        return lines
Beispiel #23
0
def mal(mal_title, mal_id=False):
    """Search MyAnimeList for *mal_title* and wrap the result in a ``MalResult``.

    :param mal_title: title sent as the ``q`` parameter of the search API.
    :param mal_id: optional id; when given, the first ``<entry>`` whose
        markup contains it replaces the full response as the source of the
        fields.  If no entry matches, the full response is used.
    :returns: ``MalResult`` built from the fields read out of the response;
        ``status`` and the synopsis are passed through ``translate(..., 'es')``.
    :raises AssertionError: when no ``<id>`` could be read (message is
        *mal_title*).
    """
    # NOTE(review): session cookie and account credentials are hard-coded
    # here; they should come from configuration / a secret store.
    cookies = {
        "incap_ses_224_81958":
        "P6tYbUr7VH9V6shgudAbA1g5FVYAAAAAyt7eDF9npLc6I7roc0UIEQ=="
    }
    response = requests.get(
        "http://myanimelist.net/api/anime/search.xml",
        params={'q': mal_title},
        cookies=cookies,
        auth=("zodman1", "zxczxc"),
        headers={
            'User-Agent':
            'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36'
        })
    content = response.content
    if mal_id is not False:
        for entry in xpath.search(content, "//entry"):
            if mal_id in entry:
                content = entry
                break

    tqdm.write("%s %s" % ((mal_title, ), mal_id))
    anime_id = xpath.get(content, "//id")
    title = xpath.get(content, "//title")
    title_en = xpath.get(content, "//english")
    type_ = xpath.get(content, "//type")
    synonyms = xpath.get(content, "//synonyms")
    status = xpath.get(content, "//status")
    synopsys = translate(xpath.get(content, "//synopsis"), "es")
    img = xpath.get(content, "//image")
    episodes = xpath.get(content, "//episodes")
    # NOTE(review): ``synopsys`` is already translated; the cleaned copy is
    # translated a second time below - confirm that this is intended.
    resumen = synopsys.replace("&lt;br /&gt;", " ").replace("\n\r", "")
    resumen = translate(resumen, 'es')
    status = translate(status, 'es')
    # compare by value: ``is not ""`` tests object identity, which only works
    # by accident of string interning
    assert anime_id != "", mal_title

    data = dict(title=title,
                title_en=title_en,
                type=type_,
                status=status,
                resumen=resumen,
                img=img,
                episodes=episodes,
                synonyms=synonyms,
                id=anime_id,
                synopsys=synopsys)
    return MalResult(**data)
Beispiel #24
0
    def _loadFinished(self, result):
        """Store the xpath matches of the main frame's page, then call ``crawl()``.

        The matches for ``self.xpathFilter`` are stored in ``self.data`` under
        the frame's URL.  *result* is not used.
        """
        frame = self.mainFrame()
        page_url = str(frame.url().toString())
        markup = unicode(frame.toHtml())
        self.data[page_url] = xpath.search(markup, self.xpathFilter)
        self.crawl()
        
# TEST STUB  
#urls = [u'http://www.vrapce.mk/ad/31515', u'http://www.vrapce.mk/ad/15389', u'http://www.vrapce.mk/ad/27998', u'http://www.vrapce.mk/ad/24257', u'http://www.vrapce.mk/ad/19107', u'http://www.vrapce.mk/ad/14938', u'http://www.vrapce.mk/ad/14093', u'http://www.vrapce.mk/ad/14287', u'http://www.vrapce.mk/ad/14285', u'http://www.vrapce.mk/ad/14095', u'http://www.vrapce.mk/ad/14283', u'http://www.vrapce.mk/ad/31674', u'http://www.vrapce.mk/ad/31501', u'http://www.vrapce.mk/ad/18958', u'http://www.vrapce.mk/ad/33154', u'http://www.vrapce.mk/ad/2306', u'http://www.vrapce.mk/ad/32088', u'http://www.vrapce.mk/ad/29153', u'http://www.vrapce.mk/ad/23524', u'http://www.vrapce.mk/ad/20304', u'http://www.vrapce.mk/ad/4108', u'http://www.vrapce.mk/ad/22328', u'http://www.vrapce.mk/ad/3279', u'http://www.vrapce.mk/ad/13233', u'http://www.vrapce.mk/ad/2827', u'http://www.vrapce.mk/ad/24813', u'http://www.vrapce.mk/ad/18957', u'http://www.vrapce.mk/ad/5466', u'http://www.vrapce.mk/ad/31556', u'http://www.vrapce.mk/ad/29668']  
# url = [u'http://www.vrapce.mk/']
# urls = []
# r = MultiPageFilterRenderer(url, '//a[@class="advertImage3Inner"]/@href')  
# urls = r.data['http://www.vrapce.mk/'] 
# print urls
# description = 
#     print description
Beispiel #25
0
def incr_database(conn):
    """Store today's quotes for every index listed in ``stocks.csv``.

    Each line of ``stocks.csv`` starts with the index code (text before the
    first tab).  Lines mentioning ``CSI`` or ``000985`` are scraped from the
    csindex.com.cn detail page and written to ``quote_csi``; all other codes
    are fetched with ``ts.get_k_data`` and appended to ``quote_nocsi`` after
    deleting the rows already stored for that code and day.

    :param conn: object whose ``execute(sql)`` runs the generated statements.
    """
    # NOTE(review): the statements below are assembled with ``%`` formatting.
    # Values come from a local file and a scraped page; switching to bound
    # parameters needs the paramstyle of ``conn``, which is not visible here.
    D = download.Download(delay=0, read_cache=None, write_cache=None)
    src = 'http://www.csindex.com.cn/zh-CN/indices/index-detail/'
    # created on first use and reused, instead of one engine per row
    engine = None
    with open('stocks.csv') as stocks:
        for i in stocks:
            code = i.split('\t')[0]
            if 'CSI' in i or '000985' in i:
                html = D.get(src + code)
                trddate = common.regex_get(html, r'截止日期:([^<]+)<')
                if trddate:
                    trddate = trddate.replace('-', '')
                m = xpath.search(html,
                                 r'//table[@class="table\stc"]/tr/td',
                                 remove=None)
                close = m[0] if m else None
                change = m[1] if m and len(m) > 1 else None
                sql = ''' 
                     REPLACE INTO quote_csi(code, close, date, chg) VALUES('%s',%s,%s,%s);
            ''' % (code, close, trddate, change)
                conn.execute(sql)
            else:
                today = datetime.today().strftime('%Y-%m-%d')
                if engine is None:
                    engine = create_engine(
                        'mysql://*****:*****@localhost:3306/dige', echo=False)
                try:
                    df = ts.get_k_data(code,
                                       ktype='D',
                                       index=True,
                                       start=today,
                                       end=today)
                    if not df.empty:
                        sql = ''' delete from quote_nocsi where code like '%%%s%%' and date = '%s' ''' % (
                            code, today)
                        conn.execute(sql)
                        df.to_sql('quote_nocsi', engine, if_exists='append')
                except Exception as e:
                    # best effort: report the failure and go on with the next code
                    print(e)
Beispiel #26
0
def search_animenetwork(title):
    """Query the Anime News Network encyclopedia API for *title*.

    The title is sent as ``anime=~<title>``.  For every ``<anime>`` element
    of the response a dict with the keys ``summary``, ``images``, ``genres``,
    ``openings``, ``endings`` and ``id`` is produced.

    :returns: list of those dicts, in response order.
    """
    response = requests.get(
        "http://cdn.animenewsnetwork.com/encyclopedia/api.xml",
        params={'anime': "~" + title})
    results = []
    for anime in xpath.search(response.content, "//anime"):
        # the last matched ``id`` attribute is taken
        anime_id = xpath.search(anime, "./@id").pop()
        images = xpath.search(anime, "//info/img/@src")
        summary = xpath.get(anime, "//info[@type='Plot Summary']")
        genres = xpath.search(anime, "//info[@type='Genres']")
        openings = xpath.search(anime, "//info[@type='Opening Theme']")
        endings = xpath.search(anime, "//info[@type='Ending Theme']")
        results.append({
            'summary': summary,
            'images': images,
            'genres': genres,
            'openings': openings,
            'endings': endings,
            'id': anime_id,
        })
    return results
Beispiel #27
0
def mal_search(mal_title, mal_id=False):
    """Search MyAnimeList for *mal_title* and return the basic title data.

    :param mal_title: title sent as the ``q`` parameter of the search API.
    :param mal_id: optional id; when given, the first ``<entry>`` whose
        markup contains it is used.  Without it the ``//anime/entry`` match
        of the response is used.
    :returns: dict with the keys ``title``, ``english_title``, ``synonyms``
        and ``id``.
    """
    # NOTE(review): session cookie and account credentials are hard-coded.
    session_cookies = {"incap_ses_224_81958":"P6tYbUr7VH9V6shgudAbA1g5FVYAAAAAyt7eDF9npLc6I7roc0UIEQ=="}
    user_agent = 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36'
    response = requests.get(
        "http://myanimelist.net/api/anime/search.xml",
        params={'q': mal_title},
        cookies=session_cookies,
        auth=("zodman1", "zxczxc"),
        headers={'User-Agent': user_agent},
    )
    content = response.content
    if mal_id is False:
        content = xpath.get(content, "//anime/entry")
    else:
        # when no entry mentions the id, the full response stays in use
        for entry in xpath.search(content, "//entry"):
            if mal_id in entry:
                content = xpath.get(entry, "//anime/entry")
                break
    english_title = xpath.get(content, '//english')
    title = xpath.get(content, '//title')
    synonyms = xpath.get(content, '//synonyms')
    anime_id = xpath.get(content, "//id")
    return {
        'title': title,
        'english_title': english_title,
        'synonyms': synonyms,
        'id': anime_id,
    }
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 11 10:41:19 2015

@author: justin.malinchak
"""

from webscraping import download, xpath

D = download.Download()

# Fetch the HFRX index page and print the second and third cell of every
# matched table row.
# NOTE(review): the row selector ends in "/t" (possibly a truncated "/tr") and
# the "Sunrise/Sunset" labels look copied from another example - confirm.
html = D.get('https://www.hedgefundresearch.com/hfrx_reg/index.php')
ROW_SELECTOR = '//table[@class="spad"]/tbody/t'
for row in xpath.search(html, ROW_SELECTOR):
    cols = xpath.search(row, '/td')
    print('Sunrise: %s, Sunset: %s' % (cols[1], cols[2]))
Beispiel #29
0
def _mobile_bg_listing_rows(url):
    """Download one mobile.bg result page and return its advert tables."""
    html = unicode(Downloader(url).get_content())
    rows = xpath.search(html, '//form[@name="search"]/table[@class="tablereset"]')
    # The first three and the last four matching tables are dropped
    # (presumably page furniture rather than adverts -- TODO confirm).
    return rows[3:len(rows) - 4]


def scrapeMobileBg():
    """Scrape vehicle adverts from a fixed set of mobile.bg result pages.

    Every listing row is turned into an ``Ad``; the detail page of each
    advert is downloaded to obtain its description and region.

    Returns:
        The result of ``adsToJson`` applied to the collected ``Ad`` objects.
    """
    # cp1251 support
    reload(sys)
    sys.setdefaultencoding('cp1251')
    now = datetime.now()
    date = "%d-%d-%d" % (now.year, now.month, now.day)

    # Saved-search identifiers; each one names a result page to scrape.
    slinks = (
        '71wxzy', '71xw69', '71xwi1', '71xwr0', '71xx7g', '71xxjy',
        '71xzyr', '71y06e', '71y0dk', '71y0q6', '71y16v', '71y1ep',
        '71y2ih', '71y2x5', '71y34p', '71y3ex', '71y3wj', '71y449',
        '71y4wz', '71y5qh', '71y5yv', '71y6az', '71y6kg', '71y6qz',
    )
    linkovi = []
    for slink in slinks:
        page_url = 'http://www.mobile.bg/pcgi/mobile.cgi?act=3&slink=%s&f1=1' % slink
        linkovi.extend(_mobile_bg_listing_rows(page_url))

    ads = []
    for row in linkovi:
        link = xpath.get(row, '//td[@class="valgtop"]/a[@class="mmm"]/@href')
        title = xpath.get(row, '//td[@class="valgtop"]/a[@class="mmm"]').strip()
        imageUrl = xpath.get(row, '//a[@class="photoLink"]/img/@src')
        cont = unicode(Downloader(link).get_content())

        description = xpath.get(cont, '//td[@style="font-size:13px;"]').strip()
        description = description.split("<a href")[0]
        if description == "» ":
            description = "/"
        else:
            # Cut the fixed-length 19-character tail off the description.
            description = description[0:len(description)-19]
        description = description.replace("\"", "")

        category = u"Возила"
        subcategory = "/"

        price = xpath.get(row, '//span[@class="price"]').strip()
        tokens = price.split(" ")
        if price == u"Договаряне" or len(tokens) < 2:
            # Negotiable price, or no amount/currency pair present.
            value = "/"
            currency = "/"
        else:
            # The amount may be split into thousands groups by spaces; the
            # last token is the currency, everything before it the amount.
            value = "".join(tokens[:-1])
            currency = tokens[-1]
            if currency == "лв.":
                currency = "BGN"

        region = xpath.get(cont, '//td[@style="padding:10px"]').strip()
        region_parts = region.split("Регион: ")
        if len(region_parts) > 1:
            region = region_parts[1].split(" ")[0].replace("<a","").strip()
        else:
            # Region label missing on the detail page.
            region = "/"

        country = u"Бугарија"

        ads.append(Ad(link, title, imageUrl, description, category,
                      subcategory, value, currency, region, date, country))
    return adsToJson(ads)
#print scrapeMobileBg()
# Export article titles, read counts and URLs to articles.csv.
# This snippet is cut off after the final `if`, so the loop's exit
# condition is not visible here.
writer = common.UnicodeWriter('articles.csv')

# Header row of the CSV output.
writer.writerow(['Title', 'Num reads', 'URL'])

seen_urls = set(
)  # article URLs already written, used to prevent duplicate rows

D = download.Download()

# Iterate over each category; the path is a template taking the page number.
# NOTE(review): the hyphens inside the 'knowledge...base' path and the
# 'feed...header...wrap' class below do not look like ASCII '-' -- verify.
for category_link in ('/developer/knowledge‐base?page=%d',
                      '/developer/articles?page=%d'):
    # Iterate over the pages of a category, counting up from 0 without bound.
    for page in itertools.count():
        category_html = D.get(urlparse.urljoin(DOMAIN, category_link % page))
        article_links = xpath.search(category_html,
                                     '//div[@class="morelink"]/a/@href')
        # Number of not-yet-seen articles found on this page.
        num_new_articles = 0
        for article_link in article_links:
            # Scrape each article that has not been seen before.
            url = urlparse.urljoin(DOMAIN, article_link)
            if url not in seen_urls:
                num_new_articles += 1
                seen_urls.add(url)
                html = D.get(url)
                title = xpath.get(html, '//div[@class="feed‐header‐wrap"]/h2')
                # NOTE(review): `.replace` is referenced but never called, so
                # num_reads is a bound method rather than text; this looks
                # like a truncated `.replace(...)` call -- confirm.
                num_reads = xpath.get(
                    html,
                    '//li[@class="statistics_counter last"]/span').replace
                row = title, num_reads, url
                writer.writerow(row)
        # A page without new articles is tested for here; the body of this
        # branch is missing from the snippet.
        if num_new_articles == 0:
Beispiel #31
0
def scrapeReklama5():
    """Scrape the adverts on the reklama5.mk search page.

    Each advert block on the listing is converted into an ``Ad``; the
    description is obtained through ``getDescription`` from the advert's
    own page.

    Returns:
        The result of ``adsToJson`` applied to the collected ``Ad`` objects.
    """
    # UTF-8 support
    reload(sys)
    sys.setdefaultencoding('utf-8')
    html = unicode(Downloader('https://www.reklama5.mk/Search').get_content())

    requestedWebPageUrl = 'https://www.reklama5.mk'

    ads = []
    for advert in xpath.search(html, '//div[@class="OglasResults"]'):
        link = requestedWebPageUrl + xpath.get(
            advert, '//a[@class="SearchAdTitle"]/@href')

        title = xpath.get(
            advert,
            '//a[@class="SearchAdTitle"].text()').strip().replace("\"", "")

        description = getDescription(
            link, '//div[@class="oglasTitle"]/p[@class="oglasTitle"]').strip(
            ).replace("\"", "")

        subcategory = "/"
        imageUrl = xpath.get(advert, '//img[@class="thumbnail thumbs"]/@src')
        if imageUrl == "/Content/images/noImage2.jpg":
            # The placeholder image is site-relative; make it absolute.
            imageUrl = requestedWebPageUrl + imageUrl

        price = xpath.get(advert, '//div[@class="text-left text-success"]')
        price = [token for token
                 in re.sub(r'\s+', ' ', price).strip().split(" ") if token]
        # A missing amount or currency is represented by "/" so that the two
        # lookups below cannot run past the end of the token list.
        while len(price) < 2:
            price.append("/")
        if price[0] == "По":
            price[0] = "/"
        if price[1] == "Договор":
            price[1] = "/"

        value = price[0]
        currency = price[1]
        if currency == "€":
            currency = "EUR"
        if currency == u"МКД":
            currency = "MKD"

        region = xpath.get(advert, '//p[@class="clear-margin"]')
        region = region.split("&gt;")[0].strip()
        country = u"Македонија"

        # The date cell holds both the day label and the time, so it is read
        # once and split, rather than fetched twice.
        date = xpath.get(advert,
                         '//div[@class="text-center clear-padding adDate"]')
        date = re.sub(r'\s+', ' ', date).strip()
        stamp = date.split(" ")
        if len(stamp) > 1 and stamp[0] == u"Денес":
            # The day label is replaced by today's date; the time is kept.
            today = datetime.now()
            date = "%d-%d-%d %s" % (today.year, today.month, today.day,
                                    stamp[1])

        category = xpath.get(advert, '//p[@class="adCategoryName"]/a')

        ads.append(Ad(link, title, imageUrl, description, category,
                      subcategory, value, currency, region, date, country))

    return adsToJson(ads)
Beispiel #32
0
def scrapeMobile24():
    """Scrape the adverts of every mobile24.mk vehicle category.

    The listing rows (``tr`` elements with class ``t0`` or ``t1``) of each
    category page are collected, then every row is turned into an ``Ad``;
    the advert's detail page is downloaded for the description.

    Returns:
        The result of ``adsToJson`` applied to the collected ``Ad`` objects.
    """
    # UTF-8 support
    reload(sys)
    sys.setdefaultencoding('utf-8')
    now = datetime.now()
    date = str(now.year) + "-" + str(now.month) + "-" + str(now.day)

    category_urls = (
        'http://www.mobile24.mk/avtomobili/',
        'http://www.mobile24.mk/motocikli/',
        'http://www.mobile24.mk/kombinja/',
        'http://www.mobile24.mk/kamioni/',
        'http://www.mobile24.mk/prikolki/',
        'http://www.mobile24.mk/avtobusi/',
        'http://www.mobile24.mk/gumiiavtodelovi/',
    )
    linkovi = []
    for position, url in enumerate(category_urls):
        html = unicode(Downloader(url).get_content())
        t0_rows = xpath.search(html, '//tr[@class="t0"]')
        t1_rows = xpath.search(html, '//tr[@class="t1"]')
        # Row order is kept exactly as before: t0 rows first on the first
        # category, t1 rows first on all later ones.
        if position == 0:
            linkovi.extend(t0_rows)
            linkovi.extend(t1_rows)
        else:
            linkovi.extend(t1_rows)
            linkovi.extend(t0_rows)

    ads = []
    for row in linkovi:
        link = xpath.get(row, '//a[@class="listing-title"]/@href')
        title = xpath.get(row, '//a[@class="listing-title"]/b')
        imageUrl = xpath.get(row, '//td[@class="image"]/a/img/@src')
        cont = unicode(Downloader(link).get_content())

        desc = xpath.search(
            cont,
            '//div[@class="item-left"]/div[@class="fieldset rounded4"]/div')
        if not desc:
            # No description block on the detail page.
            description = "/"
        elif len(desc) == 4:
            description = desc[1]
        else:
            description = desc[0]

        category = u"Возила"
        subcategory = "/"

        price = xpath.get(row, '//td[@class="price"].text()')
        value = xpath.get(row, '//td[@class="price"]/span').replace(",", "")
        # The currency is the text that follows the closing span tag.
        pieces = price.split("span>")
        currency = pieces[2].split("<")[0] if len(pieces) > 2 else "/"
        if currency == u"денари":
            currency = "MKD"
        if value == u"По договор":
            value = "/"
            currency = "/"

        region = xpath.get(row, '//span[@class="city"]')
        country = u"Македонија"

        ads.append(Ad(link, title, imageUrl, description, category,
                      subcategory, value, currency, region, date, country))
    return adsToJson(ads)


# print scrapeMobile24()
import json
import csv
import sys
import codecs

# Collect talk data from the PyCon ES 2015 schedule page.
# This snippet is cut off after `talk_pycones = {}`.

# Download instance
D = download.Download()

# Fetch the schedule page
html = D.get('http://2015.es.pycon.org/es/schedule/')

# Parity of `index` selects how a row is read (see below).
# NOTE(review): `index` is not incremented anywhere in the visible part.
index = 0
talks_pycones = []

# Iterate over the divs that hold the talk information
for row in xpath.search(html, '//div[@class="col-xs-12"]'):

    # Even index: read title, author and time from the slot block.
    if index % 2 == 0:
        talk = xpath.search(row, '//div[@class="slot-inner"]/h3')

        author = xpath.search(row, '//div[@class="slot-inner"]/p/strong')

        hour = xpath.search(row, '//div[@class="slot-inner"]/strong')

    # Odd index: read the description paragraphs of the row.
    if index % 2 != 0:
        description = xpath.search(row, '/p')

        # Only continue when all four searches returned a non-empty result.
        if talk is not None and author is not None and description is not None and hour is not None and len(
                talk) > 0 and len(author) > 0 and len(description) > 0 and len(
                    hour) > 0:
            talk_pycones = {}
Beispiel #34
0
def scrapePobarajOglasi():
    """Scrape the first advert listing page of pobaraj.com.mk.

    Every ``li`` of the advert list is turned into an ``Ad``; the advert's
    detail page is downloaded for description, categories and region.

    Returns:
        The result of ``adsToJson`` applied to the collected ``Ad`` objects.
    """
    # UTF-8 support
    reload(sys)
    sys.setdefaultencoding('utf-8')
    now = datetime.now()
    # Month abbreviations as they are compared against the site's date
    # text; the position in the tuple (1-based) is the month number.
    # Matching is done with ``==`` so mixed str/unicode operands behave
    # exactly as in a plain comparison.
    months = ("Јан", "Фев", "Мар", "Апр", "Мај", "Јун",
              "Јул", "Авг", "Сеп", "Окт", "Ное", "Дек")

    html = unicode(Downloader('http://www.pobaraj.com.mk/lista_na_oglasi/all/1').get_content())
    site = xpath.get(html, '//ul[@class="lista_na_oglasi"]')
    ads = []
    for item in xpath.search(site, '//li'):
        link = "http://www.pobaraj.com.mk" + xpath.get(
            item, '//a[@class="title"]/@href')
        title = xpath.get(item, '//a[@class="title"]')
        imageUrl = xpath.get(item, '//a[@class="photo"]/img/@src')
        cont = unicode(Downloader(link).get_content())

        description = xpath.get(cont,
                                '//div[@class="oglas_prikaz_opis"]').strip()
        if description == "":
            description = "/"

        # Breadcrumb links: index 1 is the category, index 2 (when present)
        # the subcategory.
        kategorii = xpath.search(cont, '//a[@class="pateka"]')
        category = kategorii[1]
        if len(kategorii) > 2:
            subcategory = kategorii[2]
        else:
            subcategory = "/"

        price = xpath.get(item, '//div[@class="price"]').strip()
        price = price.split("<div ")[0].strip()
        price = price.split("Цена: ")[1]
        if price == u"по договор":
            value = "/"
            currency = "/"
        else:
            price = price.split(" ")
            value = price[0]
            if price[1] == u"денари":
                currency = "MKD"
            elif price[1] == u"евра":
                currency = "EUR"
            else:
                currency = price[1]

        # The city is the bold text following the "Град:" label.
        region = xpath.get(cont, '//div[@class="oglas_prikaz_left"]').strip()
        region = region.split("Град:<")[1]
        region = region.split("<b class")[0]
        region = region.split("b>")[1].strip()
        country = u"Македонија"

        # Date text has the form "<label>: <day part>, <time>".
        datum = xpath.get(item, '//div[@class="oglas_date"]').strip()
        datum = datum.split(": ")[1].split(", ")
        vreme = datum[1]
        datum = datum[0]
        if datum == u"Денес":
            day = "%d-%d-%d" % (now.year, now.month, now.day)
        elif datum == u"Вчера":
            yesterday = now - timedelta(days=1)
            day = "%d-%d-%d" % (yesterday.year, yesterday.month,
                                yesterday.day)
        else:
            # "<day> <month abbreviation>"; the current year is assumed.
            parts = datum.split(" ")
            # Fall back to the raw text when the month is not recognised
            # (previously this path raised a TypeError).
            day = datum
            if len(parts) > 1:
                for number, name in enumerate(months, 1):
                    if parts[1] == name:
                        day = str(now.year) + "-" + str(number) + "-" + parts[0]
                        break
        date = day + " " + vreme

        ads.append(Ad(link, title, imageUrl, description, category,
                      subcategory, value, currency, region, date, country))
    return adsToJson(ads)


#print scrapePobarajOglasi()
Beispiel #35
0
def _koli_age_in_days(tokens):
    """Convert the tokens of an advert-age text into a whole number of days.

    Returns 0 for ages given in seconds, minutes or hours, the count for
    days, 30 days per month for months, and None for an unknown unit.
    """
    if len(tokens) == 1:
        # Singular form without a leading number.
        unit = tokens[0]
        if unit in (u"секунда", u"минута", u"час"):
            return 0
        if unit == u"ден":
            return 1
        if unit == u"месец":
            return 30
        return None
    # "<number> <unit>" form.
    unit = tokens[1]
    if unit in (u"секунди", u"минути", u"часа"):
        return 0
    if unit == u"дена":
        return int(tokens[0])
    if unit == u"месеци":
        # Convert to int BEFORE multiplying; multiplying the text first
        # would repeat the digits 30 times instead.
        return int(tokens[0]) * 30
    return None


def scrapeKoli():
    """Scrape the used-car listing of koli.com.mk.

    Each advert link found on the listing page is downloaded and turned
    into an ``Ad``. The advert date is derived from the age text on the
    advert page; it is left empty when the age unit is not recognised.

    Returns:
        The result of ``adsToJson`` applied to the collected ``Ad`` objects.
    """
    reload(sys)
    sys.setdefaultencoding('utf-8')
    html = unicode(Downloader('http://koli.com.mk/polovni_lista.aspx').get_content())
    ads = []
    # Reference time for all adverts; never rebound, so one advert's age
    # cannot shift the date of the adverts that follow it.
    now = datetime.now()
    for href in xpath.search(html, '//a[@class="linkovi_desno_golemi"]/@href'):
        link = "http://koli.com.mk/" + href
        page = unicode(Downloader(link).get_content())

        description = u"Опрема: " + xpath.get(
            page, '//span[@id="lblOprema"]') + " \nOpis: " + xpath.get(
                page, '//span[@id="lblOpis"]')
        title = xpath.get(page, '//span[@id="lblMarkaModel"].text()').strip()
        imageUrl = 'http://koli.com.mk/' + xpath.get(
            page, '//img[@id="slika"]/@src')
        subcategory = "/"
        category = u"Возила"
        region = xpath.get(page, '//span[@id="lblGrad"].text()')
        country = u"Македонија"
        value = xpath.get(page, '//span[@id="lblMomentalnaCena"]').strip()
        currency = "EUR"

        age_tokens = xpath.get(page, '//span[@id="lblDenovi"]').strip().split(" ")
        age = _koli_age_in_days(age_tokens)
        if age is None:
            date = ""
        else:
            posted = now - timedelta(days=age)
            date = str(posted.year) + "-" + str(posted.month) + "-" + str(posted.day)

        ads.append(Ad(link, title, imageUrl, description, category,
                      subcategory, value, currency, region, date, country))

    return adsToJson(ads)
import json
import csv
import sys
import codecs


# Collect talk data from the PyData Madrid 2016 schedule page.
# This snippet is cut off after the final `if`.

# Download instance
D = download.Download()

# Fetch the schedule page
html = D.get('http://pydata.org/madrid2016/schedule/')

talks_pydata = []

# Iterate over the td elements that hold the talk information
for row in xpath.search(html, '//td[@class="slot slot-talk"]'):
	
	# Speaker names, talk links and talk titles found in this cell.
	speakers = xpath.search(row,'//span[@class="speaker"]/text()')
	urls = xpath.search(row,'//span[@class="title"]//a/@href')
	talks = xpath.search(row,'//span[@class="title"]//a/text()')	
	# The first link/title of the cell is used for every speaker, and the
	# talk page is requested once per speaker.
	for speaker in speakers:
		print speaker.strip()
		print urls[0]
		print talks[0]
		details = D.get('http://pydata.org/'+urls[0])
		# First description paragraph of the talk page.
		description = xpath.search(details,'//div[@class="description"]//p/text()')[0]
		print description
		# First h4 of the main column, with newlines removed.
		hour = xpath.search(details,'//div[@class="col-md-8"]//h4/text()')[0].replace("\n","").strip()
		print hour
		
		# The body of this check is missing from the snippet.
		if talks[0] is not None and speaker is not None and description is not None and hour is not None:
# Read the list of page URLs (first column of urls-test2.csv), download each
# page and append the first five cells of every dark-grey / light-grey table
# row to darkgrey.csv / lightgrey.csv respectively.

# The input file is closed by the `with` block; a csv.reader object itself
# has no close() method.
with open('urls-test2.csv', 'rb') as url_file:
    # Blank CSV lines yield empty records and are skipped.
    names = [record[0] for record in csv.reader(url_file, delimiter=',')
             if record]

for name in names:
    html = D.get(name)
    # Line breaks inside cells become " | " separators.
    html2 = html.replace("<br />", " | ")
    print(name)

    for out_path, row_class in (("darkgrey.csv", "bgdarkgrey"),
                                ("lightgrey.csv", "bglightgrey")):
        # Append mode keeps the rows written for earlier URLs.
        with open(out_path, "a") as out_file:
            out_writer = csv.writer(out_file)
            for row in xpath.search(html2,
                                    '//table/tr[@class="%s"]' % row_class):
                cols = xpath.search(row, '/td')
                # Each row is checked against its OWN cell count.
                if len(cols) >= 5:
                    out_writer.writerow(cols[:5])
import csv, sys
from webscraping import download, xpath
D = download.Download()
 def get_followees():
     """Queue the profile links of the current user's followees and followers.

     Both list pages are downloaded first; links are only extracted when
     both downloads returned content. Every extracted link is put on
     ``zhihu_url_main``. A download error is printed and ends the call.
     """
     # Build the list URLs used to find the next layer of users.
     user_url_followees = user_url + '/followees'
     user_url_followers = user_url + '/followers'
     try:
         # Fetch the followees page, then the followers page (cache bypassed).
         html_followees = D.get(user_url_followees, delay=0.1, opener=opener, read_cache=False, write_cache=False)
         html_followers = D.get(user_url_followers, delay=0.1, opener=opener, read_cache=False, write_cache=False)
     except Exception as e:
         print('Exception in download. {}'.format(str(e)))
         return
     if not (html_followees and html_followers):
         return
     # Title element of each profile card in the list.
     card_xpath = '//div[@class="zh-general-list clearfix"]//div[@class="zm-profile-card zm-profile-section-item zg-clear no-hovercard"]//h2[@class="zm-list-content-title"]'
     # Followees are processed before followers.
     for page in (html_followees, html_followers):
         for card in xpath.search(page, card_xpath):
             # Extract the link and add it to the zhihu_url_main collection.
             zhihu_url_main.put(common.regex_get(card, r'href="(.*?)" '))
 # Run the fetch function.
 get_followees()
 # Take URLs from the main collection until one is found that has not been
 # read yet, i.e. one that is not a member of the copy collection.
 new_user_url = zhihu_url_main.get()
 while zhihu_url_copy.ismember(new_user_url) == 1:
     new_user_url = zhihu_url_main.get()
 # Store it in the copy collection (the statement doing so is not part of
 # this excerpt).
Beispiel #39
0
def scrapeHaloOglasi():
    """Scrape the newest quick adverts from halooglasi.com.

    Each result block of the listing is combined with data from the
    advert's own page (title and price) and turned into an ``Ad``.

    Returns:
        The result of ``adsToJson`` applied to the collected ``Ad`` objects.
    """
    # UTF-8 support
    reload(sys)
    sys.setdefaultencoding('utf-8')
    now = datetime.now()

    site_root = "http://www.halooglasi.com"
    # Month abbreviations used in the listing dates -> month number.
    month_numbers = {
        "Jan": "1", "Feb": "2", "Mar": "3", "Apr": "4",
        "Maj": "5", "Jun": "6", "Jul": "7", "Avg": "8",
        "Sep": "9", "Okt": "10", "Nov": "11", "Dec": "12",
    }

    listing = Downloader(
        'http://www.halooglasi.com/naslovna.240.html?search_text=&sortColumn=VremeDodavanja'
    )
    html = unicode(listing.get_content())
    container = xpath.get(html, '//div[@class="results_container"]')

    ads = []
    for result in xpath.search(container, '//div[@class="result_brza"]'):
        link = site_root + xpath.get(
            result, '//div[@style="height:auto;"]/h2/a/@href')
        cont = unicode(Downloader(link).get_content())

        # The title sits in one of two differently classed header bars.
        title = xpath.get(cont, '//div[@class="detail_bar_nek"]/h2').strip()
        if not title:
            title = xpath.get(cont, '//div[@class="detail_bar"]/h2').strip()
        imageUrl = site_root + xpath.get(result, '//a[@class="thumb"]/img/@src')

        description = xpath.get(result, '//div[@class="text_ogl"]/p')

        # Category path: the second chunk after the whitespace separator,
        # split on " > ".
        crumbs = xpath.get(result, '//div[@class="brza_link"]').strip()
        crumbs = crumbs.split("\r\n\t\t\t\t\t\t\r\n\t\t\t\t\t\t")[1]
        crumbs = crumbs.split(" > ")
        category = crumbs[0]
        if len(crumbs) > 2:
            subcategory = crumbs[1]
        else:
            # The subcategory is still wrapped in markup; take the text
            # between "'>" and the next "<".
            subcategory = crumbs[1].split("'>")[1].split("<")[0]

        price = xpath.get(cont, '//div[@class="price"]').strip()  #price
        if not price:
            price = xpath.get(
                cont, '//div[@class="price deal"]').strip()  #price deal
        price = price.replace(".", "").replace("din", " DIN").replace(
            "&euro;", " EUR")
        if price == "Dogovor":
            value = "/"
            currency = "/"
        else:
            amount_and_currency = price.split(" ")
            value = amount_and_currency[0]
            currency = amount_and_currency[1]

        # First span: date ("day month year"); second span: location.
        date_loc = xpath.search(result, '//div[@class="datum_grad"]/h6/span')
        date_loc[0] = date_loc[0].strip()
        pieces = date_loc[0].split("\r\n")[0].replace(".", "").split(" ")
        # Unknown month names are left unchanged.
        pieces[1] = month_numbers.get(pieces[1], pieces[1])
        date = pieces[2] + "-" + pieces[1] + "-" + pieces[0]

        region = date_loc[1].strip().split("&nbsp;")[0]
        country = u"Србија"

        ads.append(Ad(link, title, imageUrl, description, category,
                      subcategory, value, currency, region, date, country))

    return adsToJson(ads)


#print scrapeHaloOglasi()
# Read the list of page URLs (first column of urls-test2.csv), download each
# page and append the first five cells of every dark-grey / light-grey table
# row to darkgrey.csv / lightgrey.csv respectively.

# The input file is closed by the `with` block; a csv.reader object itself
# has no close() method.
with open('urls-test2.csv', 'rb') as url_file:
    # Blank CSV lines yield empty records and are skipped.
    names = [record[0] for record in csv.reader(url_file, delimiter=',')
             if record]

for name in names:
    html = D.get(name)
    # Line breaks inside cells become " | " separators.
    html2 = html.replace("<br />", " | ")
    print(name)

    for out_path, row_class in (("darkgrey.csv", "bgdarkgrey"),
                                ("lightgrey.csv", "bglightgrey")):
        # Append mode keeps the rows written for earlier URLs.
        with open(out_path, "a") as out_file:
            out_writer = csv.writer(out_file)
            for row in xpath.search(html2,
                                    '//table/tr[@class="%s"]' % row_class):
                cols = xpath.search(row, '/td')
                # Each row is checked against its OWN cell count.
                if len(cols) >= 5:
                    out_writer.writerow(cols[:5])
import csv, sys
from webscraping import download, xpath
D = download.Download()
Beispiel #41
0
def scrapeKupujemProdajem():
    """Scrape the adverts on the kupujemprodajem.com search result page.

    Scraping is best effort: if the listing cannot be fetched an empty
    result is produced, and an advert that fails to parse is skipped.

    Returns:
        The result of ``adsToJson`` applied to the collected ``Ad`` objects.
    """
    # UTF-8 support
    reload(sys)
    sys.setdefaultencoding('utf-8')
    now = datetime.now()
    ads = []
    subcategory = "/"
    country = u"Србија"
    date = str(now.year) + "-" + str(now.month) + "-" + str(now.day)

    try:
        down = Downloader(
            'http://www.kupujemprodajem.com/search.php?action=list&data[category_id]=&data[group_id]=&data[location_id]=&data[keywords]=&submit[search]=Tra%C5%BEi'
        )
        html = unicode(down.get_content())
        linkovi = xpath.search(html, '//div[@class="item clearfix"]')
        # Highlighted adverts use a different class and come last.
        linkovi.extend(xpath.search(
            html, '//div[@class="item clearfix adHighlighted"]'))
    except Exception:
        # Listing unavailable: return what we have (nothing). Catching
        # Exception rather than everything lets KeyboardInterrupt and
        # SystemExit propagate.
        return adsToJson(ads)

    for item in linkovi:
        try:
            link = "http://www.kupujemprodajem.com/" + xpath.get(
                item, '//a[@class="adName"]/@href')
            title = xpath.get(item, '//a[@class="adName"]')
            region = xpath.get(item,
                               '//section[@class="locationSec"]').strip()
            region = region.split(" | ")[0]

            # Price text is "<amount>&nbsp;<currency>"; anything else is
            # treated as unspecified.
            price = xpath.get(item, '//span[@class="adPrice"]')
            price = price.split("&nbsp;")
            if len(price) == 2:
                # Drop thousands separators and the decimal part.
                value = price[0].replace(".", "").split(",")[0]
                currency = price[1]
            else:
                value = "/"
                currency = "/"

            if currency == "&euro;":
                currency = "EUR"
            elif currency == "din":
                currency = "DIN"

            content = Downloader(link).get_content()

            category = xpath.get(content, '//a[@class="crumbs"]')
            description = xpath.get(
                item, '//section[@class="nameSec"]/p[@class="adDescription"]')
            category = category.split("|")[0].strip()
            imageUrl = xpath.get(
                content, '//div[@class="adThumbnailHolder"]/a/img/@src')
            # Collapse "//" to "/" and drop the first character.
            imageUrl = imageUrl.replace("//", "/")
            imageUrl = imageUrl[1::]
            if imageUrl == "":
                imageUrl = "/"
            description = description.replace("...<p>", "").strip()
            ads.append(Ad(link, title, imageUrl, description, category,
                          subcategory, value, currency, region, date,
                          country))
        except Exception:
            # Skip adverts that cannot be downloaded or parsed.
            continue

    return adsToJson(ads)


#print scrapeKupujemProdajem()
from webscraping import download, xpath
import json
import csv
import sys
import codecs

# Downloader shared by the schedule request and the per-talk requests.
D = download.Download()

# Schedule overview of PyData Madrid 2016.
html = D.get('http://pydata.org/madrid2016/schedule/')

talks_pydata = []

# Every talk is held in a <td class="slot slot-talk"> cell.
for row in xpath.search(html, '//td[@class="slot slot-talk"]'):

    speakers = xpath.search(row, '//span[@class="speaker"]/text()')
    urls = xpath.search(row, '//span[@class="title"]//a/@href')
    talks = xpath.search(row, '//span[@class="title"]//a/text()')
    # The cell's first link and title are reported for each speaker, and the
    # talk page is requested once per speaker.
    for speaker in speakers:
        print(speaker.strip())
        print(urls[0])
        print(talks[0])
        details = D.get('http://pydata.org/' + urls[0])
        # First description paragraph of the talk page.
        paragraphs = xpath.search(details,
                                  '//div[@class="description"]//p/text()')
        description = paragraphs[0]
        print(description)
        # First h4 of the main column, newlines removed.
        headings = xpath.search(details,
                                '//div[@class="col-md-8"]//h4/text()')
        hour = headings[0].replace("\n", "").strip()
Beispiel #43
0
def _avtooglasiYmd(day):
    """Format *day* as ``year-month-day`` without zero padding."""
    return "%d-%d-%d" % (day.year, day.month, day.day)


def _avtooglasiPrice(text):
    """Split a price label into a ``(value, currency)`` pair.

    The label is expected to be ``"<amount> <currency>"``. The two words
    ``По`` / ``договор`` are each replaced by the ``"/"`` placeholder and the
    ``&euro;`` entity becomes ``"EUR"``. A label with fewer than two tokens
    yields ``("/", "/")`` instead of leaving the price undefined.
    """
    tokens = text.strip().split(" ")
    if len(tokens) < 2:
        return "/", "/"
    amount, unit = tokens[0], tokens[1]
    if amount == u"По":
        amount = "/"
    if unit == u"договор":
        unit = "/"
    if unit == "&euro;":
        unit = "EUR"
    return amount, unit


def _avtooglasiDate(stamp, now):
    """Turn the advert's timestamp text into the date string stored on the ad.

    ``Денес`` / ``Вчера`` map to the date of *now* / the day before, followed
    by the third token of the text when it is present. ``пред <n> дена`` maps
    to the date *n* days before *now*. Any other text is returned as its first
    two tokens joined by a space. Text that cannot be interpreted yields ``""``.
    """
    datum = stamp.strip().split(" ")
    marker = datum[0]

    if marker == u"Денес" or marker == u"Вчера":
        day = now if marker == u"Денес" else now - timedelta(days=1)
        if len(datum) > 2:
            # third token is copied through unchanged (presumably the time)
            return _avtooglasiYmd(day) + " " + datum[2]
        return _avtooglasiYmd(day)

    if marker == u"пред":
        if len(datum) < 3:
            return ""
        if datum[2] == u"дена":
            try:
                age = int(datum[1])
            except ValueError:
                return ""
        elif datum[1] == "1":
            # NOTE(review): any unit other than "дена" is treated as 30 days
            # for a count of "1" and 60 days otherwise - presumably months;
            # confirm against the site.
            age = 30
        else:
            age = 60
        return _avtooglasiYmd(now - timedelta(days=age))

    return " ".join(datum[:2])


def scrapeAvtooglasi():
    """Scrape the avtooglasi.com.mk result page and return the ads as JSON.

    Image, price and advert nodes are collected with three separate XPath
    queries and matched by position (this assumes the page lists them in the
    same order - the pairing is not verified). For every advert the detail
    page is read through ``getDescription`` and an ``Ad`` is built. Missing
    image or price data is replaced by the ``"/"`` placeholder, and an advert
    whose info line does not have the expected seven `` | ``-separated fields
    is skipped, so one malformed listing no longer aborts the whole scrape.

    Returns:
        The result of ``adsToJson`` applied to the list of built ads.
    """
    # UTF-8 support (Python 2 only): makes implicit str/unicode conversions
    # use UTF-8 for the rest of the process.
    reload(sys)
    sys.setdefaultencoding('utf-8')
    now = datetime.now()
    down = Downloader('http://www.avtooglasi.com.mk/rezultati/show/?vid=0&orderby=0')
    content = down.get_content()
    html = unicode(content)
    sliki = xpath.search(html, '//div[@class="resultLeft"]')
    ostanato = xpath.search(html, '//div[@class="oglasInfoTopContent"]')
    ceni = xpath.search(html, '//a[@class="btn btn-info btn-xs oglasInfoAdditionalPrice"]')

    # Values shared by every ad from this site
    category = u"Возила"
    subcategory = "/"
    # NOTE(review): spelling kept exactly as before because it is emitted
    # data; it looks like a typo of "Македонија" - confirm before changing.
    country = u"Македеонија"

    ads = []
    for i, advert in enumerate(ostanato):
        # Thumbnail: fall back to "/" when the block or its <img> is missing
        imageUrl = "/"
        if i < len(sliki):
            sources = xpath.search(sliki[i], '//a[@class="thumbnail resultImg"]/img/@src')
            if sources:
                imageUrl = sources[0]

        # Price: fall back to "/" when there is no price node for this advert
        value, currency = "/", "/"
        if i < len(ceni):
            value, currency = _avtooglasiPrice(xpath.get(ceni[i], '//span/span'))

        link = xpath.get(advert, '//a[@class="resultMainLink"]/@href')
        title = xpath.get(advert, '//a[@class="resultMainLink"]/span').strip().replace("\"", "")

        # Info line: region | 5 detail fields | timestamp
        dodatok = xpath.get(advert, '//span[@class="oglasInfoAdditionalInfo"]').split(" | ")
        if len(dodatok) < 7:
            # Not the expected layout; skip before fetching the detail page
            continue

        # The second "padded" block of the detail page holds the free text
        path = xpath.search(getDescription(link, '//div[@class="centerC"]'), '/div/div[@class="padded"]')
        details = path[1] if len(path) > 1 else ""

        description = dodatok[1] + u" година, " + ", ".join(dodatok[2:6]) + ", " + details
        description = description.strip().replace("\"", "")

        region = dodatok[0]
        date = _avtooglasiDate(dodatok[6], now)

        ads.append(Ad(link, title, imageUrl, description, category, subcategory,
                      value, currency, region, date, country))

    return adsToJson(ads)

# print scrapeAvtooglasi()
from webscraping import download, xpath
# Script: download a page and print the second and third cell of every body
# row of the table with class "spad" as sunrise / sunset values.
D = download.Download()

# NOTE(review): the URL is passed without a scheme - confirm that
# Download.get accepts it in this form.
html = D.get('www.abv.bg')
for row in xpath.search(html, '//table[@class="spad"]/tbody/tr'):
    cols = xpath.search(row, '/td')
    if len(cols) < 3:
        # A row with fewer than three cells cannot supply both values;
        # skip it instead of failing on the index lookup.
        continue
    # Single parenthesised argument: prints the same text on Python 2 and
    # is also valid Python 3 syntax.
    print('Sunrise: %s, Sunset: %s' % (cols[1], cols[2]))
import csv
import sys
import codecs


#Download instance
D = download.Download()

#get page
html = D.get('http://2015.es.pycon.org/es/schedule/')

index =0
talks_pycones = []

# Iterate over the div elements where the schedule information is located
for row in xpath.search(html, '//div[@class="col-xs-12"]'):
    
    if index%2 ==0:
        talk = xpath.search(row, '//div[@class="slot-inner"]/h3')
    
        author = xpath.search(row, '//div[@class="slot-inner"]/p/strong')
    
        hour = xpath.search(row, '//div[@class="slot-inner"]/strong')
        
    if index%2 !=0:
        description = xpath.search(row, '/p')
        
        if talk is not None and author is not None and description is not None and hour is not None and len(talk)>0 and len(author)>0 and len(description)>0 and len(hour)>0:
            talk_pycones ={}
            talk_pycones['talk'] = talk[0].decode('utf-8').encode('cp850','replace').decode('cp850')
            talk_pycones['author'] = author[0].decode('utf-8').encode('cp850','replace').decode('cp850')