Beispiel #1
Datei: Projekt: niudd/yh
def run():
    """Crawl every URL in ``urls_to_crawl`` over and over, forever.

    Each URL is first requested without a proxy.  If that request yields a
    falsy response, the request is repeated with ``proxy=True`` until a
    truthy response comes back (there is no retry limit and no delay
    between attempts).  The response is then handed to ``spider.parse``.

    Relies on the module-level names ``urls_to_crawl``, ``ip_list``,
    ``browser_list``, ``make_request`` and ``spider``.  Never returns.
    """
    separator = '--------------------------------------'

    while True:
        for url in urls_to_crawl:
            # Single-argument print(...) emits the same text under Python 2
            # and Python 3, unlike the print statement.
            print(url)

            # First attempt goes out directly, without a proxy.
            proxy = False
            print('1 requests...')
            response = make_request(url, ip_list, browser_list, proxy)
            if not response:
                print('fail...\n' + separator)
                # Every retry from here on is made through a proxy.
                proxy = True

            attempt = 2
            while not response:
                print('%d requests...' % attempt)
                attempt += 1
                response = make_request(url, ip_list, browser_list, proxy)
            print('requests succeed...\n' + separator)

            print('parsing page...')
            item = spider.parse(response)

            print('collecting into database...')
            # NOTE(review): no call that stores `item` is visible here; the
            # parsed result is currently discarded -- confirm against the
            # original project.

            print('finish this page\n' + separator)
def storePageAndHeader ( url, webPage, headerInfo, itemName, itemType ):
    """
    Adds record for url, item, and urlToItem tables in database.
    Writes webPage, header, and tokens to files.

    url        -- address the page was fetched from
    webPage    -- page content; parsed by spider.parse and written to data/raw
    headerInfo -- header text; searched for a content-type line and written
                  to data/header
    itemName   -- passed to db.insertItem
    itemType   -- passed to db.insertItem

    Prints the id returned by db.insertCachedUrl.
    """
    title, tokens = spider.parse( webPage, args.tokenizer )
    lowerTokens   = spider.lower( tokens )
    stemTokens    = spider.stem( lowerTokens, args.stemmer )

    # Pull the value of the content-type header line (case-insensitive).
    reStr  = "content-type:(?P<ct>(.*))"
    result = re.search( reStr, headerInfo, re.IGNORECASE )

    # A header block without a content-type line gives no match; fall back
    # to an empty string, mirroring the empty-title fallback below.
    contentType = result.group( "ct" ) if result else ""

    if not title:
        title = ""

    urlId  = db.insertCachedUrl( url, contentType, title )
    itemId = db.insertItem( itemName, itemType )
    u2iId  = db.insertUrlToItem( urlId, itemId )

    # All three output files share one name derived from the url id.
    fileNum = makeFileName( urlId )
    writeToFile( "data/raw/"    + fileNum + ".html", webPage )
    writeToFile( "data/header/" + fileNum + ".txt",  headerInfo )
    writeToFile( "data/clean/"  + fileNum + ".txt",  stemTokens, True )

    print( urlId )
def restoreClean ( ):
    """
    Opens every file in data/raw directory, parsing, lowercasing,
    and stemming. Stores results in data/clean directory.

    Each output file keeps the base name of its input file; that base name
    is converted to int and printed, so it is expected to be numeric.
    """
    for htmlFileName in os.listdir( 'data/raw' ):
        with open( 'data/raw/' + htmlFileName ) as htmlFile:
            webPage = htmlFile.read( )

            title, tokens = spider.parse( webPage, args.tokenizer )
            lowerTokens   = spider.lower( tokens )
            stemTokens    = spider.stem( lowerTokens, args.stemmer )

            # splitext removes only the extension; str.strip('.html') would
            # strip any of the characters '.', 'h', 't', 'm', 'l' from both
            # ends of the name.
            cleanFileName = os.path.splitext( htmlFileName )[0]
            writeToFile( "data/clean/"  + cleanFileName + ".txt",  stemTokens, True )

            print( int(cleanFileName) )
Beispiel #4
def mine(settings,results):
    """
    Converts string settings to its corresponding file
    and uses the data in it to construct a list of IDs to parse.
    Then runs spider.parse on every URL built from that list.

    settings -- str, handed to getSettings; the result is indexed with the
                keys 'debug' and 'missing'
    results  -- str; NOTE(review): the original description says results
                are appended to the file this names, but no code doing so
                is visible -- the function appears truncated.
    """
    assert type(settings) is str and type(results) is str
    settings = getSettings(settings)
    if not settings['debug'] and not settings['missing']:
        idList = getIDs(settings)
    else:
        idList = []
        # NOTE(review): the statements that fill idList in 'debug' and
        # 'missing' mode are absent from the available source.  Fail loudly
        # instead of silently mining an empty list; restore the original
        # bodies before using these modes.
        if settings['debug']:
            raise NotImplementedError(
                "mine(): ID selection for settings['debug'] is missing")
        if settings['missing']:
            raise NotImplementedError(
                "mine(): ID selection for settings['missing'] is missing")
    urls = constructURLs(settings,idList)
    for url in urls:
        # NOTE(review): `data` is not used in the visible code.
        data = spider.parse(url)
Beispiel #5
                purchasing_person_name = re.findall("金额(.*?)<", content_xml,
                    purchasing_person_name = re.findall(
                        "中标价(.*?)中标", content, re.M)[0]
                    purchasing_person_name = ''

        return purchasing_person_name.replace(":", "").replace(":", "").strip()

# Manual check when run as a script: create a MySpider, set proxy_enable to
# False, call parse() and parse_detail_page() with `resp` and `url`, and
# print every key/value pair of each item returned by parse_detail_page().
if __name__ == '__main__':
    spider = MySpider()
    spider.proxy_enable = False
    # ------------ parse() ----------
    # print "Starting login"
    url = ""
    # NOTE(review): the right-hand side of this assignment is missing, so the
    # file does not parse; restore the expression that produces `resp`.
    resp =
    res = spider.parse(resp, url)

    # ------------ parse_detail_page() ----------
    # url = ""
    # resp =
    # The result of parse() above is overwritten here without being used.
    res = spider.parse_detail_page(resp, url)
    for item in res:
        # NOTE(review): dict.iteritems() and the print statement below are
        # Python 2 only.
        for k, v in item.iteritems():
            print k, v
Beispiel #6
def get():
    """Return, as JSON, the parse() result for the ``isbn`` query argument.

    The response body is an object with the single key ``isbn``.
    """
    isbn = request.args.get('isbn')
    payload = {'isbn': parse(isbn)}
    return jsonify(payload)
def bin(align_param_file, ref_ang_file, pref_image_in, pref_image_out, pref_sel, pref_sel_all,thres):
   """Sort particle images into reference-angle bins and adjust their psi angle.

   Loads the alignment table and the reference-angle table with
   spider.parse and converts columns 1-3 of both from degrees to radians.
   For every image yielded by ndimage_file.iter_images the function:
     * mirrors the image with get_mirror when theta is greater than pi,
     * writes the image to pref_image_out at position i,
     * selects the bin with the largest entry of `ip` (argmax),
     * stores align[i,0] in that bin's row of `index` and bumps its count,
     * replaces align[i,1] with the value returned by adjust_psi.
   Afterwards it walks over all bins and, for bins holding more than
   `thres` non-zero entries, takes the filled part of the row and builds a
   name from pref_sel plus the zero-padded bin number.

   Parameters:
      align_param_file -- passed to spider.parse; result is indexed as a 2-D array
      ref_ang_file     -- passed to spider.parse; result is indexed as a 2-D array
      pref_image_in    -- passed to ndimage_file.iter_images
      pref_image_out   -- passed to ndimage_file.write_image
      pref_sel         -- prefix of the per-bin name (pref_sel + 5-digit bin number)
      pref_sel_all     -- not referenced in the visible body
      thres            -- a bin is kept only if it holds more than this many entries

   NOTE(review): two expressions below are garbled and the tail of the
   function looks truncated (`S`, `count`, `table` and `filename` are never
   used, and nothing is returned); see the inline notes.
   """
   # read in the alignment parameters and the reference angles
   # 1st column is psi, 2nd is theta, and 3rd is phi
   align = spider.parse(align_param_file)
   #align,header = format.read_alignment(align_param_file, ndarray=True)
   print("Reconstructing %d particles"%len(align))
   # read in reference angles
   refang = spider.parse(ref_ang_file)
   # NOTE(review): the argument of astype(...) and the closing parenthesis are
   # missing; this line does not parse as written. Column 0 is presumably the
   # particle id converted to an integer type -- confirm against the original.
   index = align[:, 0].astype(
   #refang, header = format.read_alignment(ref_ang_file, ndarray=True)
   # from degree to radian from column 1
   align[:,1:4] = np.deg2rad(align[:,1:4])
   refang[:,1:4] = np.deg2rad(refang[:,1:4])
   # read in pref of images
   iter_single_images = ndimage_file.iter_images(pref_image_in, index)
   # form unit directional vectors
   rphi = mcol(refang[:,3])
   rtheta = mcol(refang[:,2]) 
   unit_v = get_unitv(rphi,rtheta)
   # 2-D array to track indices of particles in the same angle bin
   # Max number of particles in the same angle bin 
   MAX = 5000
   # NOTE: `index` is rebound here; from now on it is the per-bin table, not
   # the selection passed to iter_images above.
   index = np.zeros((refang.shape[0],MAX))
   # array to track the number of particles in each bin
   quant = np.zeros((refang.shape[0]))
   # binning: loop through particles 
   for i, img in enumerate(iter_single_images):
      # direction of one particle
      phi = align[i,3]
      theta = align[i,2] 
      uv = get_unitv(phi,theta)
      # read in image
      #print i
      #img = ndimage_file.read_image(img)
      if theta > math.pi:
         img = get_mirror(img)
      ndimage_file.write_image(pref_image_out, img, i)
      # multiply with all ref ang and store the largest       
      # NOTE(review): the call and its first operand are missing; this line
      # does not parse. Presumably a product of unit_v with uv.T -- confirm.
      ip =,uv.T)
      # store the largest in the right bin
      # NOTE: the local name `bin` shadows this function and the builtin.
      bin = ip.argmax()
      # NOTE(review): quant comes from np.zeros without a dtype (float), so
      # quant[bin] is a float used as an array index; recent NumPy versions
      # reject non-integer indices -- confirm.
      index[bin,quant[bin]] = align[i,0]
      quant[bin] += 1
      #print index
      # adjust the psi angle
      rpsi = refang[bin,1]
      rtheta = refang[bin,2]
      rphi = refang[bin,3]  
      psi = adjust_psi(rpsi,rtheta,rphi,theta,phi)
      align[i,1] = psi     
   # loop through the bins and keep only those with more than 'thres' particles
   S = [] # will hold the selected bin numbers
   count = 0
   for j in range(refang.shape[0]):
      # number of non-zero entries in row j; a stored value of 0 would not
      # be counted
      sz =  len(np.nonzero(index[j,:])[0])
      if sz > thres:
         table = index[j,0:sz]
         #print table
         filename = pref_sel + '{:05d}'.format(j)
   #print S