def run():
    # crawl every URL, retrying through a proxy until the request succeeds
    while True:
        for url in urls_to_crawl:
            print(url)
            proxy = False
            print('request 1...')
            response = make_request(url, ip_list, browser_list, proxy)
            if not response:
                print('fail...\n--------------------------------------')
                # switch to a proxy and keep retrying until a response comes back
                proxy = True
                t = 2
                while not response:
                    print('request %d...' % t)
                    t += 1
                    response = make_request(url, ip_list, browser_list, proxy)
            print('request succeeded...\n--------------------------------------')
            print('parsing page...')
            item = spider.parse(response)
            print('collecting into database...')
            pipelines.process_item(item)
            print('finished this page\n--------------------------------------')
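make_request is defined elsewhere in that project. As a point of reference, here is a minimal sketch of the retry helper the loop above relies on; the use of the requests library, the random choice of proxy IP and User-Agent, and the timeout value are assumptions, not taken from the original code.

import random
import requests

def make_request(url, ip_list, browser_list, proxy, timeout=10):
    """Hypothetical sketch of the helper used by run(): fetch a URL with a
    random User-Agent, optionally through a random proxy, and return None
    on any failure so the caller can retry."""
    headers = {'User-Agent': random.choice(browser_list)}
    proxies = {'http': random.choice(ip_list)} if proxy else None
    try:
        resp = requests.get(url, headers=headers, proxies=proxies, timeout=timeout)
        return resp if resp.status_code == 200 else None
    except requests.RequestException:
        return None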
def storePageAndHeader(url, webPage, headerInfo, itemName, itemType):
    """
    Adds records to the url, item, and urlToItem tables in the database.
    Writes webPage, header, and tokens to files.
    """
    title, tokens = spider.parse(webPage, args.tokenizer)
    lowerTokens = spider.lower(tokens)
    stemTokens = spider.stem(lowerTokens, args.stemmer)

    # pull the content-type out of the raw header block
    reStr = "content-type:(?P<ct>(.*))"
    result = re.search(reStr, headerInfo, re.IGNORECASE)

    if not title:
        title = ""

    urlId = db.insertCachedUrl(url, result.group("ct"), title)
    itemId = db.insertItem(itemName, itemType)
    u2iId = db.insertUrlToItem(urlId, itemId)

    fileNum = makeFileName(urlId)
    writeToFile("data/raw/" + fileNum + ".html", webPage)
    writeToFile("data/header/" + fileNum + ".txt", headerInfo)
    writeToFile("data/clean/" + fileNum + ".txt", stemTokens, True)
    print(urlId)
def restoreClean():
    """
    Opens every file in the data/raw directory, parsing, lowercasing, and
    stemming its contents. Stores the results in the data/clean directory.
    """
    for htmlFileName in os.listdir('data/raw'):
        with open('data/raw/' + htmlFileName) as htmlFile:
            webPage = htmlFile.read()
        title, tokens = spider.parse(webPage, args.tokenizer)
        lowerTokens = spider.lower(tokens)
        stemTokens = spider.stem(lowerTokens, args.stemmer)
        # drop the .html extension (str.strip('.html') would strip any of the
        # characters '.', 'h', 't', 'm', 'l' from both ends, not just the suffix)
        cleanFileName = os.path.splitext(htmlFileName)[0]
        writeToFile("data/clean/" + cleanFileName + ".txt", stemTokens, True)
        print(int(cleanFileName))
def mine(settings, results):
    """
    Converts the settings string to its corresponding file and uses the data
    in it to construct a list of IDs to parse. Then sicks spider.py on that
    list and appends the results to the file corresponding to the results string.
    """
    assert type(settings) is str and type(results) is str
    settings = getSettings(settings)
    if not settings['debug'] and not settings['missing']:
        idList = getIDs(settings)
    else:
        idList = []
        if settings['debug']:
            idList.append(getErrors(results))
        if settings['missing']:
            idList.append(getMissing(results))
    urls = constructURLs(settings, idList)
    for url in urls:
        data = spider.parse(url)
        appendData(data, results)
    # extract the purchaser name: try the first pattern, fall back to a
    # second one, then to an empty string ("金额" = amount, "中标价" = winning bid price)
    try:
        purchasing_person_name = re.findall("金额(.*?)<", content_xml, re.M)[0]
    except:
        try:
            purchasing_person_name = re.findall("中标价(.*?)中标", content, re.M)[0]
        except:
            purchasing_person_name = ''
    # strip both the ASCII and the full-width colon before returning
    return purchasing_person_name.replace(":", "").replace("：", "").strip()


if __name__ == '__main__':
    spider = MySpider()
    spider.proxy_enable = False
    spider.init_dedup()
    spider.init_downloader()

    # ------------ parse() ---------- #
    # print("开始登录")  # "starting login"
    url = "http://www.fjggzyjy.cn/news/23439/"
    resp = spider.download(url)
    res = spider.parse(resp, url)

    # ------------ parse_detail_page() ---------- #
    # url = "http://www.bidcenter.com.cn/zbpage-4-%E6%B1%9F%E8%8B%8F-1.html"
    # resp = spider.download(url)
    res = spider.parse_detail_page(resp, url)
    for item in res:
        for k, v in item.items():
            print(k, v)
def get():
    isbnvalue = request.args.get('isbn')
    bookdict = parse(isbnvalue)
    return jsonify({'isbn': bookdict})
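The handler above reads the isbn query parameter, looks the book up via parse(), and returns the result as JSON. As a point of reference, a minimal sketch of how such a handler could be wired into a Flask app follows; the route path, the app setup, and the stub parse() returning a dict of book metadata are assumptions, not taken from the original code.

# Hypothetical wiring for a handler like get(); Flask usage is implied by
# request.args and jsonify, but the route path and the parse() stub are assumed.
from flask import Flask, request, jsonify

app = Flask(__name__)

def parse(isbn):
    # Stand-in for the real ISBN lookup; assumed to return book metadata.
    return {'isbn': isbn, 'title': 'unknown'}

@app.route('/book')
def get():
    isbnvalue = request.args.get('isbn')
    bookdict = parse(isbnvalue)
    return jsonify({'isbn': bookdict})

if __name__ == '__main__':
    # Example: GET http://localhost:5000/book?isbn=9780132350884
    app.run()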
def bin(align_param_file, ref_ang_file, pref_image_in, pref_image_out, pref_sel, pref_sel_all, thres):
    # read in the alignment parameters; column 0 is the particle id,
    # columns 1-3 are psi, theta, and phi
    align = spider.parse(align_param_file)
    #align, header = format.read_alignment(align_param_file, ndarray=True)
    print("Reconstructing %d particles" % len(align))
    #assert(header[0] == 'id')

    # read in the reference angles
    refang = spider.parse(ref_ang_file)
    index = align[:, 0].astype(int)
    #refang, header = format.read_alignment(ref_ang_file, ndarray=True)
    #assert(header[0] == 'id')

    # convert psi, theta, phi from degrees to radians
    align[:, 1:4] = np.deg2rad(align[:, 1:4])
    refang[:, 1:4] = np.deg2rad(refang[:, 1:4])

    # iterator over the input images, in alignment order
    iter_single_images = ndimage_file.iter_images(pref_image_in, index)

    # form unit direction vectors for the reference angles
    rphi = mcol(refang[:, 3])
    rtheta = mcol(refang[:, 2])
    unit_v = get_unitv(rphi, rtheta)

    # 2-D array to track the indices of particles in the same angle bin;
    # MAX is the maximum number of particles allowed per bin
    MAX = 5000
    index = np.zeros((refang.shape[0], MAX))
    # array to track the number of particles in each bin
    quant = np.zeros(refang.shape[0], dtype=int)

    # binning: loop through the particles
    for i, img in enumerate(iter_single_images):
        # direction of this particle
        phi = align[i, 3]
        theta = align[i, 2]
        uv = get_unitv(phi, theta)

        # mirror the image if theta is in the lower hemisphere
        if theta > math.pi:
            img = get_mirror(img)
        ndimage_file.write_image(pref_image_out, img, i)

        # inner product with all reference directions; the largest value
        # identifies the nearest angle bin
        ip = np.dot(unit_v, uv.T)
        bin = ip.argmax()
        index[bin, quant[bin]] = align[i, 0]
        quant[bin] += 1

        # adjust the psi angle relative to the chosen reference direction
        rpsi = refang[bin, 1]
        rtheta = refang[bin, 2]
        rphi = refang[bin, 3]
        psi = adjust_psi(rpsi, rtheta, rphi, theta, phi)
        align[i, 1] = psi

    # loop through the bins and keep only those with more than 'thres' particles
    S = []  # will hold the selected bin numbers
    for j in range(refang.shape[0]):
        sz = len(np.nonzero(index[j, :])[0])
        if sz > thres:
            table = index[j, 0:sz]
            filename = pref_sel + '{:05d}'.format(j)
            spider.write(filename, table)
            S.append(j)
    spider.write(pref_sel_all, S)
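get_unitv() is not shown in this snippet. As a point of reference, a minimal sketch of how spherical projection angles are commonly mapped to unit direction vectors follows; the function name and the angle convention are assumptions, and the real helper may differ.

import numpy as np

def unit_vector(phi, theta):
    """Hypothetical stand-in for get_unitv(): map spherical angles
    (in radians) to a 3-D unit direction vector."""
    return np.array([np.sin(theta) * np.cos(phi),
                     np.sin(theta) * np.sin(phi),
                     np.cos(theta)])

# The nearest reference direction is then the one with the largest inner
# product, as in the binning loop above:
#   bin = np.dot(ref_unit_vectors, unit_vector(phi, theta)).argmax()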