Example #1
0
 def __init__(self, thcnt):
     """Log in to youzy.cn and open the output / failure files."""
     Spider.__init__(self, thcnt)
     # Authenticated session used for every request.
     self.sessionReq = YouzyLogin()
     self.sessionReq.do_login()
     self.num_count = 0  # URLs fetched so far
     self.savefile = FileSaver("youzy.txt")
     self.__fail_urls = FileSaver("fail_urls.txt")
Example #2
0
 def __init__(self, thcnt):
     """Log in to youzy.cn and open the CSV output / failure files."""
     Spider.__init__(self, thcnt)
     # Authenticated session used for every request.
     self.sessionReq = YouzyLogin()
     self.sessionReq.do_login(1)
     self.num_count = 0    # URLs fetched so far
     self.parse_count = 0  # pages successfully parsed
     self.savefile = CsvSaver("spider_url_zhuanke_np.csv", fixed + pfcolumn)
     self.__fail_urls = FileSaver("spider_url_fail_zhuanke_np.txt")
Example #3
0
class YouzyNPSpider(Spider):

    def __init__(self,thcnt):
        Spider.__init__(self, thcnt)
        self.sessionReq = YouzyLogin()
        self.sessionReq.do_login(1)
        self.num_count = 0
        self.parse_count = 0
        self.savefile=CsvSaver("spider_url_zhuanke_np.csv",fixed+pfcolumn)
        self.__fail_urls = FileSaver("spider_url_fail_zhuanke_np.txt")

    def wait_q_breakable(self):
        lt = 0
        while True:
            if not self.job_queue.empty() or not self.job_queue2.empty() or not self.job_queue3.empty():
                time.sleep(5)
            if time.time() < lt + 1 and self._running_count==0:
                return True
            time.sleep(2)
            lt = time.time()
            if self._worker_count == 0:
                return False

    def dispatch(self):
        #f = open("spider_fail_np.txt", "r")
        f = open("spider_zhuanke_fail_np.txt", "r")
        while True :
            line = f.readline().strip()
            if line:
                job = {"url":line,"retry":0,"time":0}
                self.add_job(job, True)
            else:
                break
        f.close()
        self.wait_q_breakable()
        self.add_job(None, True)

    def get_fail_cnt(self, addv):
        fc = getattr(self._curltls,'failcount',0)
        if (addv):
            fc += addv
            setattr(self._curltls, 'failcount', fc)
        return fc


    def run_job(self, jobid):
        url = jobid.get("url")
        res = self.sessionReq.request_url(url)
        self.num_count += 1
        if res is None or res.code == 404:
            # retry = int(jobid.get("retry"))
            # if retry < 1:
            #     retry+=1
            #     jobid = {"url":url,"retry":retry}
            #     self.num_count -= 1
            #     print "url = [ %s ] retrying [ %d ]..."%(url,retry)
            #     self.add_job(jobid)
            # else:
                print "url= [%s] response %s "%(url,'is None' if res is None else "code is 404")
                self.__fail_urls.append(url)
        elif res.code == 503 or res.code == 500 or res.code == 502 or res.code == 504:
            print "%s ------ %d " % (url,res.code)
            self.add_job(jobid)
            time.sleep(1)
        elif res.code == 200:
            if url.__contains__('codeId'):
                retry = int(jobid.get("retry"))
                self.parse(res.content,url,retry)
            else:
                doc = html.fromstring(res.content)
                values = doc.xpath('//select[@id="ddlCodes"]/option/@value')
                if values is not None and len(values) > 0 :
                    for value in values :
                        url2 = url+"&codeId="+value
                        jobid = {"url":url2,"retry":0}
                        self.add_job(jobid)
                else:
                    retry = int(jobid.get("retry"))
                    self.parse(res.content,url,retry)
        else:
            print "%s ########UNKNOWN ERROR###### [ %d ]" %(url, res.code)

    def parse(self,text,url,retry):
        doc = html.fromstring(text)
        fixed_line = []
        name = doc.xpath('//div[@class="box"]/h2')
        if len(name) < 1:
            # if retry < 3:
            #     retry += 1
            #     print "page content is none, retrying [%d] url=[%s]"%(retry,url)
            #     jobid = {"url":url,"retry":retry}
            #     self.num_count -= 1
            #     self.add_job(jobid)
            # else :
            print "page content is none,write failure file , url =[%s]"%url
            self.__fail_urls.append(url)
            return False
        fixed_line.append(name[0].text_content().encode('utf-8'))
        column_fixed = fixed+pfcolumn

        options = doc.xpath('//select[@id]/option[@selected]')
        for option in options:
            fixed_line.append(option.text_content().encode('utf-8'))

        headers = doc.xpath('//table[@class="table table-hover table-bordered table-th-gray"]/thead/tr/th')
        header_column= []
        for header in headers:
            header_column.append(header.text_content().encode('utf-8'))

        bodys = doc.xpath('//table[@class="table table-hover table-bordered table-th-gray"]/tbody')
        body_list = []
        for body in bodys:
            style = body.attrib.get('style', None)
            if not style:
                trs = body.xpath('tr')
                first = ''
                rowspan = 0
                fo = 0
                for tr in trs:
                    fo += 1
                    tds = tr.xpath('td')
                    if len(tds) == len(header_column):
                        first = tds[0].text_content().encode('utf-8')
                        rowspan = int(tds[0].attrib.get('rowspan'))
                        fo = 1
                        print "---first=%s,rowspan=%d"%(first,rowspan)
                    td_list = []
                    if fo <= rowspan and fo >1:
                        td_list.append(first)
                    for td in tds:
                        style = td.attrib.get('style', None)
                        if style:
                            continue
                        td_list.append(td.text_content().encode('utf-8'))
                    body_list.append(td_list)

        self.handle_line(body_list,header_column)
        for tr in body_list:
            line = fixed_line+tr
            self.savefile.writerline(line)
        self.parse_count += 1
        return True


    def handle_line(self,body_list,header_column):
        for tr in body_list:
            for h in pfcolumn:
                if not header_column.__contains__(h):
                    index = pfcolumn.index(h)
                    tr.insert(index,'null')

    def event_handler(self, evt, msg, **kwargs):
        if evt == 'DONE':
            spider.util.sendmail('*****@*****.**', '%s DONE' % sys.argv[0], msg)
Example #4
0
class YouzySpiderPF(Spider):
    def __init__(self, thcnt):
        Spider.__init__(self, thcnt)
        self.sessionReq = YouzyLogin()
        self.sessionReq.do_login(5)
        self.savefile = CsvSaver("spider_zhuanke_pf.csv", fixed + pfcolumn)
        self.__fail_urls = FileSaver("spider_zhuanke_fail_pf.txt")
        self.id_count = 0
        self.url_count = 0
        self.parse_count = 0

    def wait_q_breakable(self):
        lt = 0
        while True:
            if not self.job_queue.empty() or not self.job_queue2.empty(
            ) or not self.job_queue3.empty():
                time.sleep(5)
            if time.time() < lt + 1 and self._running_count == 0:
                return True
            time.sleep(2)
            lt = time.time()
            if self._worker_count == 0:
                return False

    def dispatch(self):
        f = open("zhuanke.txt", "r")
        while True:
            line = f.readline().strip()
            if line:
                job = {"id": line, "retry": 0}
                self.add_job(job, True)
            else:
                break
        f.close()
        self.wait_q_breakable()
        self.add_job(None, True)

    def run_job(self, jobid):
        if self.id_count % 11 == 0:
            print "NOW =================================================================== \n [id_count=%d],[url_count=%d],[parse_count=%d]" % (
                self.id_count, self.url_count, self.parse_count)
        id = jobid.get("id")
        baseUrl = "http://www.youzy.cn/college/pfraction?courseType=-1&year=0&Id=" + id
        province = {
            "1", "842", "843", "851", "1128", "845", "834", "844", "848",
            "847", "855", "849", "850", "859", "837", "846", "839", "852",
            "860", "854", "840", "841", "1120", "857", "856", "862", "835"
        }
        self.id_count += 1
        for pro in province:
            url1 = baseUrl + "&provinceId=" + pro
            res = self.sessionReq.request_url(url1)
            self.url_count += 1
            if res is None:
                retry = int(jobid.get("retry"))
                if retry < 5:
                    retry += 1
                    jobid = {"id": id, "retry": retry}
                    self.id_count -= 1
                    print "id %s retrying %d..." % (id, retry)
                    self.add_job(jobid)
                else:
                    self.__fail_urls.append(url1)
            elif res.code == 200:
                doc = html.fromstring(res.content)
                values = doc.xpath('//select[@id="ddlCodes"]/option/@value')
                if values is not None and len(values) > 0:
                    for value in values:
                        url2 = url1 + "&codeId=" + value
                        self.req_local(url2, 0)
                else:
                    self.parse(res.content, url1)
            elif res.code == 503 or res.code == 500 or res.code == 502 or res.code == 504:
                time.sleep(1)
                self.id_count -= 1
                self.add_job(jobid)
            elif res.code == 404:
                self.__fail_urls.append(url1)
                print "page [ %s ] is not found ! " % url1

    def parse(self, text, url):
        doc = html.fromstring(text)
        fixed_line = []
        name = doc.xpath('//div[@class="box"]/h2')
        if len(name) < 1:
            print "page content is none, url = %s" % url
            self.__fail_urls.append(url)
            return False
        fixed_line.append(name[0].text_content().encode('utf-8'))
        options = doc.xpath('//select[@id]/option[@selected]')
        for option in options:
            fixed_line.append(option.text_content().encode('utf-8'))
        headers = doc.xpath(
            '//table[@class="table table-hover table-bordered table-th-gray"]/thead/tr/th'
        )
        header_column = []
        for header in headers:
            style = header.attrib.get('style', None)
            if style:
                continue
            header_column.append(header.text_content().encode('utf-8'))

        bodys = doc.xpath(
            '//table[@class="table table-hover table-bordered table-th-gray"]/tbody'
        )
        body_list = []
        for body in bodys:
            style = body.attrib.get('style', None)
            if not style:
                trs = body.xpath('tr')
                for tr in trs:
                    tds = tr.xpath('td')
                    td_list = []
                    for td in tds:
                        style = td.attrib.get('style', None)
                        if style:
                            continue
                        td_list.append(td.text_content().encode('utf-8'))
                    body_list.append(td_list)

        self.handle_line(body_list, header_column)
        for tr in body_list:
            line = fixed_line + tr
            self.savefile.writerline(line)
        self.parse_count += 1
        return True

    def handle_line(self, body_list, header_column):
        for tr in body_list:
            for h in pfcolumn:
                if not header_column.__contains__(h):
                    index = pfcolumn.index(h)
                    tr.insert(index, 'null')

    def req_local(self, url, retry):
        res = self.sessionReq.request_url(url)
        self.url_count += 1
        if res is None:
            if retry < 5:
                retry += 1
                self.url_count -= 1
                self.req_local(url, retry)
            else:
                self.__fail_urls.append(url)
        elif res.code == 404:
            self.__fail_urls.append(url)
            print "%s ------ 404" % url
        elif res.code == 503 or res.code == 500 or res.code == 502 or res.code == 504:
            print "%s ------ %d " % (url, res.code)
            time.sleep(1)
            self.url_count -= 1
            self.req_local(url, retry)
        elif res.code == 200:
            self.parse(res.content, url)
        else:
            print "#######################################UNKNOWN ERROR############################################# [ %d ]" % res.code
            #raise AccountErrors.NoAccountError('fatal error')

    def event_handler(self, evt, msg, **kwargs):
        if evt == 'DONE':
            spider.util.sendmail('*****@*****.**', '%s DONE' % sys.argv[0],
                                 msg)
Example #5
0
class YouzySpider(Spider):
    """Spider that fetches youzy.cn score pages from a URL list file."""

    def __init__(self, thcnt):
        """Log in to youzy.cn and open the output / failure files."""
        Spider.__init__(self, thcnt)
        # Authenticated session used for every request.
        self.sessionReq = YouzyLogin()
        self.sessionReq.do_login()
        self.num_count = 0  # URLs fetched so far
        self.savefile = FileSaver("youzy.txt")
        self.__fail_urls = FileSaver("fail_urls.txt")

    def wait_q_breakable(self):
        """Poll until all three job queues stay empty (~1s apart) with no
        job running (return True), or every worker has exited (False)."""
        last_poll = 0
        while True:
            queues = (self.job_queue, self.job_queue2, self.job_queue3)
            if any(not q.empty() for q in queues):
                time.sleep(5)
            # Queues were empty less than a second after the previous pass
            # and nothing is running: safe to stop waiting.
            if self._running_count == 0 and time.time() < last_poll + 1:
                return True
            time.sleep(2)
            last_poll = time.time()
            if self._worker_count == 0:
                return False

    def dispatch(self):
        """Queue one job per non-blank line of the URL list, then wait for
        the queues to drain and push the stop sentinel."""
        self.bs = BinSaver("youzy_job.bin")
        # 'with' guarantees the file is closed even if add_job raises;
        # the original leaked the handle on an exception.
        with open("url_cfraction918-.txt", "r") as f:
            for raw in f:
                line = raw.strip()
                if not line:
                    break  # a blank line (or EOF) ends the list, as before
                self.add_job({"url": line}, True)
        self.wait_q_breakable()
        self.add_job(None, True)  # sentinel: no more jobs

    def get_fail_cnt(self, addv):
        """Return this thread's consecutive-failure count, incremented by
        addv (when truthy) before being returned."""
        count = getattr(self._curltls, 'failcount', 0)
        if addv:
            count += addv
            self._curltls.failcount = count
        return count


    def run_job(self, jobid):
        url = jobid.get("url")
        print "url == ",url
        tid = self.get_tid()
        res = self.sessionReq.request_url(url)

        self.num_count += 1

        if res is None:
            if self.get_fail_cnt(1) < 10:
                self.add_job(jobid)
            else:
                self.__fail_urls.append(url)
                raise AccountErrors.NoAccountError("failcount = [ %d ]" % (self.get_fail_cnt(0)))
            return
        else:
            setattr(self._curltls,'failcount',0)

        if res.code == 404:
            print "%s ------ 404" % url
            return
        elif res.code == 503 or res.code == 500 or res.code == 502 or res.code == 504:
            print "%s ------ %d " % (url,res.code)
            self.add_job(jobid)
            time.sleep(0.8)
            return
        elif res.code == 200:
            print "%s ------ saving " % url
            self.parse(res.content)
            con = []
            type = {"key":con}
            str1 = json.dumps(type)
            self.savefile.append(str1)
            #print "content======\n",res.content
            #self.bs.append(fn, res.text)
        else:
            print "#######################################UNKNOWN ERROR############################################# [ %d ]" % res.code
            #Log.error("unknown error...")
            #Log.errorbin("%s" %url, res.text)
            raise AccountErrors.NoAccountError('fatal error')


    def event_handler(self, evt, msg, **kwargs):
        """Mail a DONE notification when the crawl finishes."""
        if evt != 'DONE':
            return
        spider.util.sendmail('*****@*****.**', '%s DONE' % sys.argv[0], msg)

    def parse(self,text):
        """Print one page's selected filters, table header and data rows.

        Output is tab-separated via positional str.format templates;
        extra list items beyond a template's placeholders are ignored by
        format, missing ones raise IndexError.
        """
        #doc = html.fromstring(get_doc("daxue.html"))
        #print "type=========",type(doc)
        doc = html.fromstring(text)
        # Part 1: the selected <option> of every <select> that has an id.
        part1_template ='{}\t\t\t\t\t{}\t\t\t\t\t{}\t\t\t\t\t{}'
        part1_list = []
        els = doc.xpath('//select[@id]/option[@selected]')
        for el in els:
            print el
            part1_list.append(el.text_content().encode('utf-8'))

        print part1_template.format(*part1_list)
        print "================================================================================="

        header_template = '{}\t\t\t\t\t{}\t\t\t\t\t{}\t\t\t\t\t{}\t\t\t\t\t{}'
        header_list = []

        headers = doc.xpath('//table[@class="table table-hover table-bordered table-th-gray"]/thead/tr/th')
        for header in headers:
            style = header.attrib.get('style', None)
            if style:
                # Header cells carrying an inline style are skipped.
                continue

            header_list.append(header.text_content().encode('utf-8'))

        print header_template.format(*header_list)

        bodys = doc.xpath('//table[@class="table table-hover table-bordered table-th-gray"]/tbody')

        # NOTE(review): td_list is created once and never reset per <tr>,
        # so cells accumulate across rows and each print emits the first
        # five collected cells again (format ignores the extras).  It
        # probably should be re-initialized inside the tr loop -- confirm
        # intent before changing.
        td_list = []

        for body in bodys:
            style = body.attrib.get('style', None)
            if not style:
                trs = body.xpath('tr')

                for tr in trs:
                    tds = tr.xpath('td')
                    for td in tds:
                        style = td.attrib.get('style', None)
                        if style:
                            continue
                        td_list.append(td.text_content().encode('utf-8'))
                    print header_template.format(*td_list)