def save(self, fn):
    """Replay every record of the bin file *fn* into the page store.

    Each record name is parsed into (gettime, jdid); records that do not
    parse are skipped.  Successful saves bump the module-level
    ``saved_count`` and a progress line is printed every 1000 saves.

    :param fn: path of the bin file to read.
    """
    start = datetime.datetime.now()
    r = BinReader(fn)
    # self.skipto(r, 'zhilian_144361202250012_1446732064')
    while True:
        fnpath = r.fn  # todo
        # Record the file offset BEFORE reading, so (fnpath, offset)
        # points at the start of the record being saved.
        offset = r.fd.tell()
        print "fn: %s, offset: %d" % (fnpath, offset)
        (n, v) = r.readone()
        if n is None:
            break
        print fn, n
        (getime, jdid) = self.parse_name(n)
        if getime is None or jdid is None:
            continue
        getime = int(getime)
        pp = self.get_pagestore()
        if pp.save(getime, jdid, self.get_url(jdid), v, fnpath, offset):
            global saved_count
            saved_count += 1
            timeused = str(datetime.datetime.now() - start)
            # % 1000 == 1 so the very first save also prints progress.
            if saved_count % 1000 == 1:
                print "====== saved:%d [%.2f%%][%s] =====" % (
                    saved_count, r.progress() * 100, timeused)
def save_bj_word(url, ch): m = re.search('^(.*):\/\/(.*)', url) channel = m.group(1) docid = m.group(2) print channel dburl = 'mongodb://*****:*****@localhost/' client = pymongo.MongoClient(dburl) c = client.admin['page_store_' + channel] page = c.find({'indexUrl': url}) count = 0 success = False for p in page: count += 1 print p['pageContentPath'] (ft, ofn, pos) = p['pageContentPath'].split('::') reader = BinReader(ofn) content = reader.readone_at(int(pos)) print content[0] print content[1] print datetime.datetime.fromtimestamp( long(p['crawlerUpdateTime']) / 1000) res = ch.parse(p['indexUrl'], content[1]) if res: print res['name'], docid success = True # print res['value'] else: print 'failed', docid print 'count:', count return success
def getRegNo(self, begin, end): rs_path = "/home/peiyuan/data/qichacha/sum/qichacha/39/qichacha.9139.bin" binReader = BinReader(rs_path) i = begin k = 0 while k< i: binReader.readone() k += 1 while i < end: line = binReader.readone() if line[0] == None: break i += 1 m = re.search(r"<li><label>注册号: </label>(.*?)</li>", line[1]) if m: regNo = m.group(1) print regNo if regNo.strip()!="": self.dsfile.write(regNo.strip()+"\n") self.dsfile.flush() else: self.fail_file.write(line[0]+" has no regno!\n") else: self.fail_file.write(line[0]+" has no regno!\n")
def handle_item(self, item):
    """Attach the raw page content to *item* and dispatch parsed children.

    :param item: mongo document with a "pageContentPath" of the form
        "<type>::<bin file>::<offset>".
    """
    (ft, ofn, pos) = item['pageContentPath'].split('::')
    # Reuse the cached reader while consecutive items share a bin file.
    if self.bin_reader is None or self.bin_reader.fd.name != ofn:
        self.bin_reader = BinReader(ofn)
    item['content'] = self.bin_reader.readone_at(int(pos))
    for child in self.parse_item(item):
        self.process_child_item(child)
def read_bin_file(): t = BinReader('nacao_captcha_image.bin') count = 0 while True: (a, b) = t.readone() if a is None or b is None: break count += 1 save_file("/home/windy/codeimg/nacao/"+ a +".jpeg", b) print count, a
def read_all(self):
    """Load every document of the source collection, filling 'content'
    from the bin file named in each document's pageContentPath.

    :returns: list of mongo documents with 'content' attached.
    """
    docs = []
    reader = None
    for doc in self.mongo.admin[self.source].find():
        (ft, ofn, pos) = doc['pageContentPath'].split('::')
        # Only reopen the reader when the bin file changes.
        if reader is None or reader.fd.name != ofn:
            reader = BinReader(ofn)
        doc['content'] = reader.readone_at(int(pos))
        docs.append(doc)
    return docs
def read_all(self):
    """Load every document of self.collection, filling 'content' from the
    bin file named in each document's pageContentPath.

    :returns: list of mongo documents with 'content' attached.
    """
    docs = []
    reader = None
    for doc in self.client[self.database][self.collection].find():
        (ft, ofn, pos) = doc['pageContentPath'].split('::')
        # Only reopen the reader when the bin file changes.
        if reader is None or reader.fd.name != ofn:
            reader = BinReader(ofn)
        doc['content'] = reader.readone_at(int(pos))
        docs.append(doc)
    return docs
def read_all(self):
    """Return deep copies of all cached docs, each with 'content' read
    from the bin file named in its pageContentPath.

    :returns: list of copied documents (empty when nothing is cached).
    """
    out = []
    if len(self._docs) > 0:
        reader = None
        for cached in self._docs:
            # Deep-copy so the cached originals stay untouched.
            doc = copy.deepcopy(cached)
            (ft, ofn, pos) = doc['pageContentPath'].split('::')
            if reader is None or reader.fd.name != ofn:
                reader = BinReader(ofn)
            doc['content'] = reader.readone_at(int(pos))
            out.append(doc)
    return out
def dispatch(self):
    """Queue one job per record of every bin file in self.bin_list, then
    wait for the queue to drain and push the end-of-work sentinel."""
    for binname in self.bin_list:
        reader = BinReader("./jobdata/" + binname)
        while True:
            (key, body) = reader.readone()
            if key is None:
                break
            self.add_job({"id": key, "content": body}, True)
    self.wait_q_breakable()
    # None signals downstream workers that no more jobs are coming.
    self.add_job(None, True)
def dojob(self):
    """Delete page-store entries whose stored body is an IIS 404 page."""
    self.init()
    reader = None
    for item in self._docs:
        (ft, ofn, pos) = item['pageContentPath'].split('::')
        if reader is None or reader.fd.name != ofn:
            reader = BinReader(ofn)
        content = reader.readone_at(int(pos))
        # content[1] is the page body; drop records that saved a 404 page.
        if 'HTTP 错误 404.0' in content[1]:
            query = {
                'indexUrl': item['indexUrl'],
                'contentSign': item['contentSign'],
            }
            self.client[self.database][self.collection].delete_one(query)
class CWPParser(AbstractParser):
    """Consuming while Producing Parser.

    Streams documents out of a mongo collection, attaches each document's
    raw page content (read from the bin file named in its
    pageContentPath), and hands the parsed child items to
    ``process_child_item``.  Subclasses implement ``parse_item`` and
    ``process_child_item``.
    """

    def __init__(self, channel, name, db='admin',
                 url='mongodb://*****:*****@localhost/'):
        AbstractParser.__init__(self, channel, name, db, url)
        # Cached BinReader; reused while consecutive items share a bin file.
        self.bin_reader = None

    @abc.abstractmethod
    def parse_item(self, page):
        """Return an iterable of child items parsed from *page*."""
        raise NotImplementedError('virtual function called')

    def init(self):
        # The mongo collection this parser consumes from.
        return self.client[self.database][self.collection]

    def iter_results(self, res):
        # Iterate every document of the collection returned by init().
        return res.find()

    def handle_item(self, item):
        # pageContentPath is "<type>::<bin file>::<offset>".
        (ft, ofn, pos) = item['pageContentPath'].split('::')
        if self.bin_reader is None or self.bin_reader.fd.name != ofn:
            self.bin_reader = BinReader(ofn)
        item['content'] = self.bin_reader.readone_at(int(pos))
        items = self.parse_item(item)
        for ii in items:
            self.process_child_item(ii)

    @abc.abstractmethod
    def process_child_item(self, item):
        """Consume one child item produced by parse_item."""
        pass
def read_bin_file(): t = BinReader('gsinfo_Guangdong_pic.bin') count = 0 while True: (a, b) = t.readone() if a is None or b is None: break count += 1 imgtype = imghdr.what(None, b) if imgtype in ['gif', 'jpeg', 'jpg', 'png', 'bmp']: spider.util.FS.dbg_save_file( "./captcha/" + spider.util.utf8str(a) + "." + imgtype, b) #print a, "save suceess..." else: print a, "验证码格式无效,可能内容已经损坏..." continue
def read_next(self, count=1000):
    """Return up to *count* deep-copied docs starting at the internal
    cursor, each with 'content' loaded from its bin file, and advance
    the cursor past the batch.

    :param count: maximum number of documents to return.
    :returns: list of copied documents (empty when the cache is empty).
    """
    batch = []
    if len(self._docs) > 0:
        reader = None
        # Clamp the batch end to the cached document count.
        end = min(self.__current + count, len(self._docs))
        for cached in self._docs[self.__current:end]:
            doc = copy.deepcopy(cached)
            (ft, ofn, pos) = doc['pageContentPath'].split('::')
            if reader is None or reader.fd.name != ofn:
                reader = BinReader(ofn)
            doc['content'] = reader.readone_at(int(pos))
            batch.append(doc)
        self.__current = end
    return batch
def random_check(self, start=0, limit=10): binreader = BinReader(self.check_file) opts, args = getopt.getopt(sys.argv[1:], "n:") if len(opts) is 0 or opts[0][1] is "": randomindex = random.randint(start, start + limit) else: randomindex = int(opts[0][1]) line = binreader.readone() i = 1 while i < randomindex and line[0] is not None: line = binreader.readone() i += 1 if line[0] is None: print "None!!!" return f = open(line[0] + ".html", "w+b") f.write(line[1]) f.close()
def dispatch(self):
    """Queue one job per record of every bin file in self.bin_list, then
    wait for the queue to drain and push the end-of-work sentinel."""
    for binname in self.bin_list:
        # Renamed from `bin` (shadowed the builtin).
        reader = BinReader("./jobdata/" + binname)
        while True:
            # BUG FIX: the original read from self.bs, ignoring the
            # reader just opened for this file (compare the sibling
            # dispatch() that correctly reads its local reader).
            (a, b) = reader.readone()
            if a is None:
                break
            job = {"index": a, "html": b}
            self.add_job(job, True)
    self.wait_q_breakable()
    # None signals downstream workers that no more jobs are coming.
    self.add_job(None, True)
def main():
    """Command-line bin-file tool.

    Options:
      -o FILE   copy matching records from the input files into FILE
      -m STR    with -o, only copy records whose value contains STR
      -p        print record values (otherwise only names)
      -i N      only handle the record at index N

    :returns: 1 on usage error, otherwise None.
    """
    try:
        opts, args = getopt.gnu_getopt(sys.argv[1:], 'o:m:pi:')
    except getopt.GetoptError as e:
        showusage()
        return 1
    outfile = None
    matchstr = ''
    printout = False
    index = -1
    for (n, v) in opts:
        if n == '-o':
            outfile = v
        if n == '-m':
            matchstr = v
        if n == '-p':
            printout = True
        if n == '-i':
            index = int(v)
    if len(args) == 0:
        showusage()
        return 1
    if outfile:
        # Copy mode: filter every record of every input into `outfile`.
        fo = BinSaver(outfile)
        for fn in args:
            r = BinReader(fn)
            while True:
                (n, v) = r.readone()
                if n is None:
                    break
                if matchstr in v:
                    fo.append(n, v)
    else:
        # Listing mode.
        for fn in args:
            # BinReader1 is only usable for name-only listing; full
            # records need BinReader.
            if printout or index != -1:
                r = BinReader(fn)
            else:
                r = BinReader1(fn)
            findex = 0
            while True:
                (n, v) = r.readone()
                if n is None:
                    break
                if index != -1:
                    if findex == index:
                        if printout:
                            print v
                        else:
                            print n
                    elif findex > index:
                        # Past the requested index -- stop this file early.
                        break
                elif printout:
                    print n, v
                else:
                    print n
                findex += 1
def sum(self):
    """Import every qichacha bin file under `dirpath` into the page store
    and print per-file and overall statistics.

    NOTE(review): `dirpath` is not defined in this method -- presumably a
    module-level constant; confirm before moving this code.

    :returns: total number of records read across all files.
    """
    filelist = self._get_file_path()
    totalcnt = 0
    rsinfo = []
    for i in range(len(filelist)):
        if os.path.isdir(dirpath + filelist[i]):
            continue
        binreader = BinReader(dirpath + filelist[i])
        line = binreader.readone()
        endline = 10000000  # safety cap on records per file
        cnt = 0
        emptycnt = 0
        skipcnt = 0
        while line[0]:
            print "reading", filelist[i], ",line", cnt + 1
            # Record names look like "<channel>.<cpid>.<gettime>".
            channel, cpid, gettime = line[0].split(".")
            if self.pagestore.check_should_fetch(cpid):
                if self.pagestore.save(
                        int(gettime), cpid,
                        "http://qichacha.com/firm_CN_" + cpid,
                        line[1]):
                    print filelist[i], "line", cnt + 1, "saved."
            else:
                print "skip", filelist[i], "line", cnt + 1
                skipcnt += 1
            if line[1] is None:
                emptycnt += 1
            line = binreader.readone()
            cnt += 1
            totalcnt += 1
            if cnt >= endline:
                break
        infostr = "" + str(filelist[i]) + " total: " + str(
            cnt) + " empty: " + str(emptycnt) + " skip: " + str(skipcnt)
        print infostr
        rsinfo.append(infostr)
    for info in rsinfo:
        print info
    print len(filelist), "files, total:", totalcnt, "results"
    return totalcnt
def go(self):
    """Run self.go_ over each bin file in the fixed list."""
    for fname in ['fo.bin']:
        reader = BinReader(fname)
        self.go_(reader)
if __name__ == '__main__':
    # Debug driver: dump everything stored for one hard-coded index URL.
    # delete_doc()
    url = 'abstract://2005100739298'
    channel = re.search('^(.*):\/\/(.*)', url).group(1)
    print channel
    store = CourtStore(channel)
    res = store.find_new(url)
    print res
    dburl = 'mongodb://*****:*****@localhost/'
    client = pymongo.MongoClient(dburl)
    c = client.zhuanli['page_store_' + channel]
    page = c.find({'indexUrl': url})
    count = 0
    for p in page:
        count += 1
        print p['pageContentPath']
        # pageContentPath is "<type>::<bin file>::<offset>".
        (ft, ofn, pos) = p['pageContentPath'].split('::')
        reader = BinReader(ofn)
        content = reader.readone_at(int(pos))
        print content[0]
        print content[1]
        print p['indexUrl']
        print p['realUrl']
        # crawlerUpdateTime is stored in milliseconds.
        print datetime.datetime.fromtimestamp(
            long(p['crawlerUpdateTime']) / 1000)
        # text=eval(content[1].replace('null','None'))
        # doc=text['FileContent']
        # save_to_word_2(doc,url[11:])
    print 'count:', count