Example #1
File: main.py Project: niudd/yh
def run():
	while True:

		for url in urls_to_crawl:
			print(url)

			proxy = False

			print('1 requests...')
			response = make_request(url, ip_list, browser_list, proxy)
			if not response:
				print('fail...\n--------------------------------------')
				proxy = True

			t = 2
			while not response:
				print('%d requests...' % t)
				t += 1
				response = make_request(url, ip_list, browser_list, proxy)
			print('requests succeed...\n--------------------------------------')

			print('parsing page...')
			item = spider.parse(response)

			print('collecting into database...')
			pipelines.process_item(item)

			print('finish this page\n--------------------------------------')
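make_request is not part of this excerpt; the sketch below is a purely illustrative guess at its shape, assuming the requests library, that browser_list holds User-Agent strings, and that ip_list holds proxy URLs (all of these are assumptions, not the project's actual code):

import random
import requests

def make_request(url, ip_list, browser_list, proxy):
    """Hypothetical sketch: fetch url with a random User-Agent,
    optionally through a random proxy, and return None on failure."""
    headers = {'User-Agent': random.choice(browser_list)}
    # assumes ip_list entries are already proxy URLs such as "http://1.2.3.4:8080"
    proxies = {'http': random.choice(ip_list)} if proxy else None
    try:
        resp = requests.get(url, headers=headers, proxies=proxies, timeout=10)
        return resp.text if resp.status_code == 200 else None
    except requests.RequestException:
        return None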
Example #2
def storePageAndHeader ( url, webPage, headerInfo, itemName, itemType ):
    """
    Adds records for the url, item, and urlToItem tables in the database.
    Writes webPage, header, and tokens to files.
    """

    title, tokens = spider.parse( webPage, args.tokenizer )
    lowerTokens   = spider.lower( tokens )
    stemTokens    = spider.stem( lowerTokens, args.stemmer )

    reStr  = "content-type:(?P<ct>(.*))"
    result = re.search( reStr, headerInfo, re.IGNORECASE )

    if not title:
        title = ""

    urlId  = db.insertCachedUrl( url, result.group( "ct" ), title )
    itemId = db.insertItem( itemName, itemType )
    u2iId  = db.insertUrlToItem( urlId, itemId )

    fileNum = makeFileName( urlId )
    writeToFile( "data/raw/"    + fileNum + ".html", webPage )
    writeToFile( "data/header/" + fileNum + ".txt",  headerInfo )
    writeToFile( "data/clean/"  + fileNum + ".txt",  stemTokens, True )

    print( urlId )
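makeFileName and writeToFile are not shown in this excerpt; judging only from how they are called above (a zero-padded name built from the numeric urlId, and a flag that turns a token list into text), one plausible reading is sketched here. These are hypothetical helpers, not the project's actual implementations:

def makeFileName(urlId):
    # Hypothetical: zero-pad the numeric urlId so file names sort lexically.
    return str(urlId).zfill(8)

def writeToFile(path, content, joinTokens=False):
    # Hypothetical: when joinTokens is set, 'content' is a token list
    # (e.g. the stemmed tokens) and is written one token per line.
    if joinTokens:
        content = "\n".join(content)
    with open(path, "w", encoding="utf-8") as f:
        f.write(content)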
Example #3
def restoreClean ( ):
    """
    Opens every file in data/raw directory, parsing, lowercasing,
    and stemming. Stores results in data/clean directory.
    """

    for htmlFileName in os.listdir( 'data/raw' ):
        with open( 'data/raw/' + htmlFileName ) as htmlFile:
            webPage = htmlFile.read( )

            title, tokens = spider.parse( webPage, args.tokenizer )
            lowerTokens   = spider.lower( tokens )
            stemTokens    = spider.stem( lowerTokens, args.stemmer )

            cleanFileName = os.path.splitext( htmlFileName )[0]
            writeToFile( "data/clean/"  + cleanFileName + ".txt",  stemTokens, True )

            print( int(cleanFileName) )
Example #4
def mine(settings,results):
	"""
	Converts string settings to its corresponding file
	and uses the data in it to construct a list of IDs to parse
	Then, sicks spider.py on that list
	And appends the results to string results' corresponding file
	"""
	assert type(settings) is str and type(results) is str
	settings = getSettings(settings)
	if not settings['debug'] and not settings['missing']:
		idList = getIDs(settings)
	else:
		idList = []
		if settings['debug']:
			idList.append(getErrors(results))
		if settings['missing']:
			idList.append(getMissing(results))
	urls = constructURLs(settings,idList)
	for url in urls:
		data = spider.parse(url)
		appendData(data,results)
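getSettings, getIDs, constructURLs, and appendData are not shown here; as one illustration, getSettings might simply load a JSON file named by the settings string, containing at least the 'debug' and 'missing' flags used above. This is an assumption about the file format, not the project's code:

import json

def getSettings(settings):
    # Hypothetical: treat the settings string as the path to a JSON file
    # holding the crawl configuration, including 'debug' and 'missing' flags.
    with open(settings) as f:
        return json.load(f)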
Example #5
                purchasing_person_name = re.findall("金额(.*?)<", content_xml,
                                                    re.M)[0]
            except IndexError:
                try:
                    purchasing_person_name = re.findall(
                        "中标价(.*?)中标", content, re.M)[0]
                except IndexError:
                    purchasing_person_name = ''

        return purchasing_person_name.replace("：", "").replace(":", "").strip()


if __name__ == '__main__':
    spider = MySpider()
    spider.proxy_enable = False
    spider.init_dedup()
    spider.init_downloader()
    # ------------ parse() ----------
    # print "开始登录"
    url = "http://www.fjggzyjy.cn/news/23439/"
    resp = spider.download(url)
    res = spider.parse(resp, url)

    # ------------ parse_detail_page() ----------
    # url = "http://www.bidcenter.com.cn/zbpage-4-%E6%B1%9F%E8%8B%8F-1.html"
    # resp = spider.download(url)
    res = spider.parse_detail_page(resp, url)
    for item in res:
        for k, v in item.items():
            print(k, v)
Example #6
def get():
    isbnvalue = request.args.get('isbn')
    bookdict  = parse(isbnvalue)
    return jsonify({'isbn':bookdict})
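This handler reads like a Flask view (request.args, jsonify), though the route registration is not shown. A minimal wiring sketch, assuming Flask and a /book endpoint (the endpoint name and the import of parse are assumptions):

from flask import Flask, request, jsonify
from spider import parse  # assumption: the project's parse() lives in spider.py

app = Flask(__name__)

@app.route('/book')
def get():
    # e.g. GET /book?isbn=9780132350884
    isbnvalue = request.args.get('isbn')
    bookdict = parse(isbnvalue)
    return jsonify({'isbn': bookdict})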
Example #7
def bin(align_param_file, ref_ang_file, pref_image_in, pref_image_out, pref_sel, pref_sel_all,thres):
   # read in the alignment parameters and the reference angles
   # 1st column is psi, 2nd is theta, and 3rd is phi
   align = spider.parse(align_param_file)
   #align,header = format.read_alignment(align_param_file, ndarray=True)
   print("Reconstructing %d particles"%len(align))
   #assert(header[0]=='id')
   # read in reference angles
   refang = spider.parse(ref_ang_file)
   index = align[:, 0].astype(int)
   #refang, header = format.read_alignment(ref_ang_file, ndarray=True)
   #assert(header[0]=='id')
   # from degree to radian from column 1
   align[:,1:4] = np.deg2rad(align[:,1:4])
   refang[:,1:4] = np.deg2rad(refang[:,1:4])
   # read in pref of images
   iter_single_images = ndimage_file.iter_images(pref_image_in, index)
   # form unit directional vectors
   rphi = mcol(refang[:,3])
   rtheta = mcol(refang[:,2]) 
   unit_v = get_unitv(rphi,rtheta)
   
   # 2-D array to track indices of particles that fall in the same angle bin
   # MAX is the maximum number of particles allowed in one angle bin
   MAX = 5000
   index = np.zeros((refang.shape[0], MAX), dtype=int)
   # array to track the number of particles in each bin
   quant = np.zeros(refang.shape[0], dtype=int)
   # binning: loop through particles 
   for i, img in enumerate(iter_single_images):
      # direction of one particle
      phi = align[i,3]
      theta = align[i,2] 
      uv = get_unitv(phi,theta)
      # read in image
      #print i
      #img = ndimage_file.read_image(img)
      if theta > math.pi:
         img = get_mirror(img)
      ndimage_file.write_image(pref_image_out, img, i)
      # multiply with all ref ang and store the largest       
      ip = np.dot(unit_v,uv.T)
      # store the largest in the right bin
      bin = ip.argmax()
      index[bin,quant[bin]] = align[i,0]
      quant[bin] += 1
      #print index
      # adjust the psi angle
      rpsi = refang[bin,1]
      rtheta = refang[bin,2]
      rphi = refang[bin,3]  
      psi = adjust_psi(rpsi,rtheta,rphi,theta,phi)
      align[i,1] = psi     
   # loop through the bins and keep only those with more than 'thres' particles
   S = [] # will hold the selected bin numbers
   count = 0
   for j in range(refang.shape[0]):
      sz =  len(np.nonzero(index[j,:])[0])
      if sz > thres:
         table = index[j,0:sz]
         #print table
         filename = pref_sel + '{:05d}'.format(j)
         spider.write(filename,table)
         S.append(j)
   #print S
   spider.write(pref_sel_all,S)
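get_unitv is not defined in this excerpt. Assuming the usual spherical convention (theta is the polar angle from the z-axis, phi the azimuth, both already in radians, as the deg2rad calls above suggest), a unit direction vector could be built as sketched below. This is an illustration of the idea, not necessarily the project's implementation:

import numpy as np

def get_unitv(phi, theta):
    # Hypothetical: spherical angles -> Cartesian unit vector(s).
    # Accepts scalars or column vectors and returns an array of shape (N, 3),
    # so np.dot(unit_v, uv.T) in the loop above yields one inner product per bin.
    phi = np.atleast_1d(phi).ravel()
    theta = np.atleast_1d(theta).ravel()
    return np.column_stack((np.sin(theta) * np.cos(phi),
                            np.sin(theta) * np.sin(phi),
                            np.cos(theta)))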