Ejemplo n.º 1
0
    def save(self, fn):
        start = datetime.datetime.now()
        r = BinReader(fn)
        # self.skipto(r, 'zhilian_144361202250012_1446732064')
        while True:
            fnpath = r.fn  #todo
            offset = r.fd.tell()
            print "fn: %s, offset: %d" % (fnpath, offset)
            (n, v) = r.readone()
            if n is None:
                break
            print fn, n
            (getime, jdid) = self.parse_name(n)
            if getime is None or jdid is None:
                continue
            getime = int(getime)
            pp = self.get_pagestore()

            if pp.save(getime, jdid, self.get_url(jdid), v, fnpath, offset):
                global saved_count
                saved_count += 1
                timeused = str(datetime.datetime.now() - start)
                if saved_count % 1000 == 1:
                    print "====== saved:%d [%.2f%%][%s] =====" % (
                        saved_count, r.progress() * 100, timeused)
Ejemplo n.º 2
0
def save_bj_word(url, ch):
    m = re.search('^(.*):\/\/(.*)', url)
    channel = m.group(1)
    docid = m.group(2)
    print channel
    dburl = 'mongodb://*****:*****@localhost/'
    client = pymongo.MongoClient(dburl)
    c = client.admin['page_store_' + channel]
    page = c.find({'indexUrl': url})
    count = 0
    success = False
    for p in page:
        count += 1
        print p['pageContentPath']
        (ft, ofn, pos) = p['pageContentPath'].split('::')
        reader = BinReader(ofn)
        content = reader.readone_at(int(pos))
        print content[0]
        print content[1]
        print datetime.datetime.fromtimestamp(
            long(p['crawlerUpdateTime']) / 1000)
        res = ch.parse(p['indexUrl'], content[1])
        if res:
            print res['name'], docid
            success = True
            # print res['value']
        else:
            print 'failed', docid
    print 'count:', count
    return success
Ejemplo n.º 3
0
    def getRegNo(self, begin, end):
        rs_path = "/home/peiyuan/data/qichacha/sum/qichacha/39/qichacha.9139.bin"
        binReader = BinReader(rs_path)
        i = begin
        k = 0
        while k< i:
            binReader.readone()
            k += 1

        while i < end:
            line = binReader.readone()
            if line[0] == None:
                break
            i += 1
            m = re.search(r"<li><label>注册号:  </label>(.*?)</li>", line[1])
            if m:
                regNo = m.group(1)
                print regNo
                if regNo.strip()!="":
                    self.dsfile.write(regNo.strip()+"\n")
                    self.dsfile.flush()
                else:
                    self.fail_file.write(line[0]+" has no regno!\n")
            else:
                self.fail_file.write(line[0]+" has no regno!\n")
Ejemplo n.º 4
0
 def handle_item(self, item):
     """Attach the stored page content to ``item`` and process its children.

     ``item['pageContentPath']`` is split on ``'::'`` into three fields; the
     second is the bin file path and the third the integer position passed
     to ``readone_at``.  The cached ``self.bin_reader`` is reused while its
     ``fd.name`` equals that path.  Every element produced by
     ``self.parse_item(item)`` is passed to ``self.process_child_item``.
     """
     _, bin_path, position = item['pageContentPath'].split('::')
     reader = self.bin_reader
     if reader is None or reader.fd.name != bin_path:
         # Different file than last time: open it and remember the reader.
         reader = BinReader(bin_path)
         self.bin_reader = reader
     item['content'] = reader.readone_at(int(position))
     for child in self.parse_item(item):
         self.process_child_item(child)
Ejemplo n.º 5
0
def read_bin_file():
    t = BinReader('nacao_captcha_image.bin')
    count = 0
    while True:
        (a, b) = t.readone()
        if a is None or b is None:
            break
        count += 1
        save_file("/home/windy/codeimg/nacao/"+ a +".jpeg", b)
        print count, a
Ejemplo n.º 6
0
 def read_all(self):
     """Return every document of the source collection with its content.

     Iterates ``self.mongo.admin[self.source].find()``.  For each document
     the ``pageContentPath`` field is split on ``'::'``; the second field is
     the bin file path and the third the integer position for
     ``readone_at``.  The result is stored under ``'content'``.  A reader is
     reused for consecutive documents while its ``fd.name`` equals the path.

     :return: list of the documents, each with a ``'content'`` key added.
     """
     collection = self.mongo.admin[self.source]
     reader = None
     docs = []
     for doc in collection.find():
         _, bin_path, position = doc['pageContentPath'].split('::')
         if reader is None or reader.fd.name != bin_path:
             reader = BinReader(bin_path)
         doc['content'] = reader.readone_at(int(position))
         docs.append(doc)
     return docs
Ejemplo n.º 7
0
 def read_all(self):
     """Return every document of the configured collection with its content.

     Iterates ``self.client[self.database][self.collection].find()``.  Each
     document's ``pageContentPath`` is split on ``'::'`` into three fields;
     the second is opened with ``BinReader`` (reusing the previous reader
     while its ``fd.name`` equals the path) and the third, as an int, is
     passed to ``readone_at``.  The record read is stored under ``'content'``.

     :return: list of the documents, each with a ``'content'`` key added.
     """
     collection = self.client[self.database][self.collection]
     reader = None
     loaded = []
     for doc in collection.find():
         _, bin_path, position = doc['pageContentPath'].split('::')
         if reader is None or reader.fd.name != bin_path:
             reader = BinReader(bin_path)
         doc['content'] = reader.readone_at(int(position))
         loaded.append(doc)
     return loaded
Ejemplo n.º 8
0
 def read_all(self):
     """Return deep copies of all cached documents with their content loaded.

     Each entry of ``self._docs`` is deep-copied so the cached document is
     left unmodified.  The copy's ``pageContentPath`` is split on ``'::'``;
     the second field is the bin file path and the third the integer
     position for ``readone_at``.  A reader is reused while its ``fd.name``
     equals the path.

     :return: list of copied documents, each with a ``'content'`` key; an
         empty list when ``self._docs`` is empty.
     """
     loaded = []
     if len(self._docs) == 0:
         return loaded
     reader = None
     for cached in self._docs:
         doc = copy.deepcopy(cached)
         _, bin_path, position = doc['pageContentPath'].split('::')
         if reader is None or reader.fd.name != bin_path:
             reader = BinReader(bin_path)
         doc['content'] = reader.readone_at(int(position))
         loaded.append(doc)
     return loaded
Ejemplo n.º 9
0
 def dispatch(self):
     """Queue one job per record of every bin file in ``self.bin_list``.

     Each file is opened from ``./jobdata/`` and read until a record with a
     ``None`` name is returned.  Every record becomes a job dict with keys
     ``"id"`` and ``"content"``.  After all files are queued the method calls
     ``self.wait_q_breakable()`` and then adds a final ``None`` job
     (presumably an end-of-work marker -- confirm against the consumer).
     """
     for binname in self.bin_list:
         reader = BinReader("./jobdata/" + binname)
         #reader = BinReader("./data/"+binname)
         while True:
             record_id, content = reader.readone()
             if record_id is None:
                 break
             self.add_job({"id": record_id, "content": content}, True)
     self.wait_q_breakable()
     self.add_job(None, True)
Ejemplo n.º 10
0
 def dojob(self):
     """Delete stored documents whose saved page is an HTTP 404 error page.

     After calling ``self.init()``, every entry of ``self._docs`` has its
     page content read from the bin file named in ``pageContentPath``
     (second ``'::'`` field, at the integer position in the third field).
     When the content contains the 404 marker text, the document matching
     the entry's ``indexUrl`` and ``contentSign`` is removed from
     ``self.client[self.database][self.collection]``.
     """
     self.init()
     bin_reader = None
     for doc in self._docs:
         _, bin_path, position = doc['pageContentPath'].split('::')
         if bin_reader is None or bin_reader.fd.name != bin_path:
             bin_reader = BinReader(bin_path)
         record = bin_reader.readone_at(int(position))
         if 'HTTP 错误 404.0' not in record[1]:
             continue
         collection = self.client[self.database][self.collection]
         selector = {
             'indexUrl': doc['indexUrl'],
             'contentSign': doc['contentSign'],
         }
         collection.delete_one(selector)
Ejemplo n.º 11
0
class CWPParser(AbstractParser):
    """Consuming-while-producing parser.

    ``handle_item`` loads the stored page for one document, hands it to
    ``parse_item`` and forwards every produced element to
    ``process_child_item``.  Subclasses supply those two methods.
    """

    def __init__(self,
                 channel,
                 name,
                 db='admin',
                 url='mongodb://*****:*****@localhost/'):
        """Forward all arguments to ``AbstractParser`` and reset the reader cache."""
        AbstractParser.__init__(self, channel, name, db, url)
        # Last BinReader opened by handle_item; reused while the file matches.
        self.bin_reader = None

    @abc.abstractmethod
    def parse_item(self, page):
        """Turn one loaded page into an iterable of child items (subclass hook)."""
        raise NotImplementedError('virtual function called')

    def init(self):
        """Return the collection ``self.client[self.database][self.collection]``."""
        database = self.client[self.database]
        return database[self.collection]

    def iter_results(self, res):
        """Return ``res.find()``, i.e. an unfiltered query on ``res``."""
        return res.find()

    def handle_item(self, item):
        """Load the page content for ``item`` and process each parsed child.

        ``item['pageContentPath']`` is split on ``'::'`` into three fields;
        the second is the bin file path and the third the integer position
        passed to ``readone_at``.  The record read is stored in
        ``item['content']`` before ``parse_item`` is called.
        """
        _, bin_path, position = item['pageContentPath'].split('::')
        reader = self.bin_reader
        if reader is None or reader.fd.name != bin_path:
            reader = BinReader(bin_path)
            self.bin_reader = reader
        item['content'] = reader.readone_at(int(position))
        for child in self.parse_item(item):
            self.process_child_item(child)

    @abc.abstractmethod
    def process_child_item(self, item):
        """Consume one element produced by ``parse_item`` (subclass hook)."""
        pass
Ejemplo n.º 12
0
def read_bin_file():
    t = BinReader('gsinfo_Guangdong_pic.bin')
    count = 0
    while True:
        (a, b) = t.readone()
        if a is None or b is None:
            break
        count += 1
        imgtype = imghdr.what(None, b)
        if imgtype in ['gif', 'jpeg', 'jpg', 'png', 'bmp']:
            spider.util.FS.dbg_save_file(
                "./captcha/" + spider.util.utf8str(a) + "." + imgtype, b)
            #print a, "save suceess..."
        else:
            print a, "验证码格式无效,可能内容已经损坏..."
            continue
Ejemplo n.º 13
0
 def read_next(self, count=1000):
     """Return the next batch of cached documents with their content loaded.

     Takes up to ``count`` entries of ``self._docs`` starting at the
     internal cursor, deep-copies each one, loads its page content from the
     bin file named in ``pageContentPath`` and advances the cursor past the
     batch.  The cursor is left untouched when ``self._docs`` is empty.

     :param count: maximum number of documents to return.
     :return: list of copied documents, each with a ``'content'`` key.
     """
     batch = []
     if len(self._docs) == 0:
         return batch
     reader = None
     # Clamp the window to the end of the cached list.
     stop = min(self.__current + count, len(self._docs))
     for cached in self._docs[self.__current:stop]:
         doc = copy.deepcopy(cached)
         _, bin_path, position = doc['pageContentPath'].split('::')
         if reader is None or reader.fd.name != bin_path:
             reader = BinReader(bin_path)
         doc['content'] = reader.readone_at(int(position))
         batch.append(doc)
     self.__current = stop
     return batch
Ejemplo n.º 14
0
 def random_check(self, start=0, limit=10):
     binreader = BinReader(self.check_file)
     opts, args = getopt.getopt(sys.argv[1:], "n:")
     if len(opts) is 0 or opts[0][1] is "":
         randomindex = random.randint(start, start + limit)
     else:
         randomindex = int(opts[0][1])
     line = binreader.readone()
     i = 1
     while i < randomindex and line[0] is not None:
         line = binreader.readone()
         i += 1
     if line[0] is None:
         print "None!!!"
         return
     f = open(line[0] + ".html", "w+b")
     f.write(line[1])
     f.close()
Ejemplo n.º 15
0
 def dispatch(self):
     """Queue one job per record of every bin file in ``self.bin_list``.

     Each file is opened from ``./jobdata/`` and read until a record with a
     ``None`` name is returned.  Every record becomes a job dict with keys
     ``"index"`` and ``"html"``.  After all files are queued the method
     calls ``self.wait_q_breakable()`` and then adds a final ``None`` job
     (presumably an end-of-work marker -- confirm against the consumer).
     """
     for binname in self.bin_list:
         # Read from the reader opened for THIS file.  The previous code
         # opened it but then read from ``self.bs``, so the per-file reader
         # was never used; the old local name also shadowed builtin ``bin``.
         reader = BinReader("./jobdata/" + binname)
         while True:
             (a, b) = reader.readone()
             if a is None:
                 break
             job = {"index": a, "html": b}
             self.add_job(job, True)
     self.wait_q_breakable()
     self.add_job(None, True)
Ejemplo n.º 16
0
def main():
    try:
        opts, args = getopt.gnu_getopt(sys.argv[1:], 'o:m:pi:')
    except getopt.GetoptError as e:
        showusage()
        return 1

    outfile = None
    matchstr = ''
    printout = False
    index = -1
    for (n, v) in opts:
        if n == '-o':
            outfile = v
        if n == '-m':
            matchstr = v
        if n == '-p':
            printout = True
        if n == '-i':
            index = int(v)

    if len(args) == 0:
        showusage()
        return 1

    if outfile:
        fo = BinSaver(outfile)
        for fn in args:
            r = BinReader(fn)
            while True:
                (n, v) = r.readone()
                if n is None:
                    break
                if matchstr in v:
                    fo.append(n, v)
    else:
        for fn in args:
            if printout or index != -1:
                r = BinReader(fn)
            else:
                r = BinReader1(fn)
            findex = 0
            while True:
                (n, v) = r.readone()
                if n is None:
                    break
                if index != -1:
                    if findex == index:
                        if printout:
                            print v
                        else:
                            print n
                    elif findex > index:
                        break
                elif printout:
                    print n, v
                else:
                    print n
                findex += 1
Ejemplo n.º 17
0
 def sum(self):
     filelist = self._get_file_path()
     totalcnt = 0
     rsinfo = []
     for i in range(len(filelist)):
         if os.path.isdir(dirpath + filelist[i]): continue
         binreader = BinReader(dirpath + filelist[i])
         line = binreader.readone()
         endline = 10000000
         cnt = 0
         emptycnt = 0
         skipcnt = 0
         while line[0]:
             print "reading", filelist[i], ",line", cnt + 1
             channel, cpid, gettime = line[0].split(".")
             if self.pagestore.check_should_fetch(cpid):
                 if self.pagestore.save(
                         int(gettime), cpid,
                         "http://qichacha.com/firm_CN_" + cpid, line[1]):
                     print filelist[i], "line", cnt + 1, "saved."
             else:
                 print "skip", filelist[i], "line", cnt + 1
                 skipcnt += 1
             if line[1] is None:
                 emptycnt += 1
             line = binreader.readone()
             cnt += 1
             totalcnt += 1
             if cnt >= endline: break
         infostr = "" + str(filelist[i]) + " total: " + str(
             cnt) + " empty: " + str(emptycnt) + " skip: " + str(skipcnt)
         print infostr
         rsinfo.append(infostr)
     for info in rsinfo:
         print info
     print len(filelist), "files, total:", totalcnt, "results"
     return totalcnt
Ejemplo n.º 18
0
 def go(self):
     """Open each configured bin file and pass its reader to ``self.go_``."""
     for bin_name in ['fo.bin']:
         reader = BinReader(bin_name)
         self.go_(reader)
Ejemplo n.º 19
0
# Manual inspection script: look up one URL in the CourtStore, then dump every
# stored page recorded for it in the ``zhuanli`` database.
if __name__ == '__main__':
    # delete_doc()
    url = 'abstract://2005100739298'
    # The channel is the part of the URL before "://".
    channel = re.search('^(.*):\/\/(.*)', url).group(1)
    print channel
    store = CourtStore(channel)
    res = store.find_new(url)
    print res
    dburl = 'mongodb://*****:*****@localhost/'
    client = pymongo.MongoClient(dburl)
    # Collection name is derived from the channel.
    c = client.zhuanli['page_store_' + channel]
    page = c.find({'indexUrl': url})
    count = 0
    for p in page:
        count += 1
        print p['pageContentPath']
        # pageContentPath holds three '::'-separated fields; the second is the
        # bin file path and the third the position passed to readone_at.
        (ft, ofn, pos) = p['pageContentPath'].split('::')
        reader = BinReader(ofn)
        content = reader.readone_at(int(pos))
        print content[0]
        print content[1]
        print p['indexUrl']
        print p['realUrl']
        # crawlerUpdateTime is divided by 1000 before conversion
        # (presumably a millisecond timestamp -- TODO confirm).
        print datetime.datetime.fromtimestamp(
            long(p['crawlerUpdateTime']) / 1000)
        # text=eval(content[1].replace('null','None'))
        # doc=text['FileContent']
        # save_to_word_2(doc,url[11:])
    print 'count:', count