def cookies_parse(self, response):
    cookies = self._set_cookies(response)
    base_url = 'https://m.alaskaair.com/'
    src = response.xpath(
        '//div[@style="position:fixed; top:0; left:0; display:none"]/img/@src'
    ).extract_first()
    appid_url = parse.basejoin(base_url, src)
    headers = {
        'accept': "image/webp,image/apng,image/*,*/*;q=0.8",
        'accept-encoding': "gzip, deflate, br",
        'accept-language': "zh-CN,zh;q=0.9",
        'referer': "https://m.alaskaair.com/shopping/flights",
        'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36",
    }
    yield scrapy.Request(url=appid_url, headers=headers, cookies=cookies,
                         callback=self.pxvid_parse, errback=self.errback,
                         meta={'res': response})
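# A minimal sketch of the URL join performed above, assuming basejoin behaves
# like urllib.parse.urljoin (its Python 3 counterpart). The @src value here is
# a hypothetical relative path, not taken from the real page.
from urllib.parse import urljoin

base_url = 'https://m.alaskaair.com/'
src = 'px/captcha/?appId=PX12345'   # hypothetical value extracted from the hidden <img>
appid_url = urljoin(base_url, src)
print(appid_url)                    # https://m.alaskaair.com/px/captcha/?appId=PX12345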
def issue_to_pr(issuenum, srcbranch, repo='astropy', sourceuser='',
                targetuser='******', targetbranch='master',
                baseurl=GITHUB_API_BASE_URL):
    """
    Attaches code to an issue, converting a regular issue into a pull request.

    Parameters
    ----------
    issuenum : int
        The issue number (in `targetuser`/`repo`) onto which the code should
        be attached.
    srcbranch : str
        The branch (in `sourceuser`/`repo`) from which to attach the code.
        After this is complete, further updates to this branch will be passed
        on to the pull request.
    repo : str
        The name of the repository (the same-name repo should be present for
        both `targetuser` and `sourceuser`).
    targetuser : str
        The name of the user/organization that has the issue.
    targetbranch : str
        The name of the branch (in `targetuser`/`repo`) that the pull request
        should merge into.
    baseurl : str
        The URL to use to access the github site (including protocol).

    .. warning::
        Be cautious supplying `pw` as a string - if you do this in an ipython
        session, for example, it will be logged in the input history,
        revealing your password in clear text. When in doubt, leave it as
        `None`, as this will securely prompt you for your password.

    Returns
    -------
    response : str
        The json-decoded response from the github server.
    error message : str, optional
        If present, indicates github responded with an HTTP error.
    """
    while not sourceuser:
        sourceuser = raw_input('Enter GitHub username to create pull request '
                               'from: ').strip()

    username, password = get_credentials(username=sourceuser)

    data = {'issue': str(issuenum),
            'head': sourceuser + ':' + srcbranch,
            'base': targetbranch}
    datajson = json.dumps(data)

    suburl = '{user}/{repo}/pulls'.format(user=targetuser, repo=repo)
    url = basejoin(baseurl, suburl)

    res = requests.post(url, data=datajson, auth=(username, password))
    return res.json()
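# A hypothetical usage sketch of issue_to_pr: attach an existing fork branch
# to issue #123 of an 'astropy'-named repo. The issue number, branch name, and
# target user here are made up; the call prompts for GitHub credentials.
response = issue_to_pr(123, 'fix-issue-123', repo='astropy',
                       targetuser='astropy', targetbranch='master')
print(response.get('html_url'))  # URL of the new pull request, if the call succeeded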
def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
    if 'location' in headers:
        newurl = headers['location']
    elif 'uri' in headers:
        newurl = headers['uri']
    else:
        return
    void = fp.read()
    fp.close()
    # In case the server sent a relative URL, join with original:
    newurl = basejoin(self.type + ":" + url, newurl)
    return self.open(newurl)
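# A minimal sketch of the join above: when a redirect Location header is
# relative, basejoin (urljoin in Python 3) resolves it against the original
# request URL. The URLs here are made up for illustration.
from urllib.parse import urljoin

original = 'http://example.com/docs/index.html'   # corresponds to self.type + ":" + url
location = '../downloads/file.tar.gz'             # hypothetical relative Location header
print(urljoin(original, location))                # http://example.com/downloads/file.tar.gz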
def start_requests(self):
    while True:
        result = self.get_task(self.spider_name)
        airports, _date, _num = result[0].split(':')
        _from, _to = airports.split('-')
        # map airport codes through the cache, falling back to the raw code
        _from = self.abc_cache.get(_from, _from)
        _to = self.abc_cache.get(_to, _to)
        _date = re.sub(r'(\d{4})(\d{2})(\d{2})', r'\1-\2-\3', _date)
        params = ('{_from}/{_to}?from={_date}&adults=5&children&infants=0'
                  '&preselect=true').format(_date=_date, _from=_from, _to=_to)
        total_url = parse.basejoin(self.start_urls, params)
        yield scrapy.Request(total_url, meta={'origin': 1},
                             errback=self.errback, callback=self.parse)
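# A small sketch of the task-string handling above, using a hypothetical task
# record of the form 'FROM-TO:YYYYMMDD:NUM'.
import re

result = ['SEA-ANC:20240315:1']          # hypothetical task returned by get_task()
airports, _date, _num = result[0].split(':')
_from, _to = airports.split('-')         # 'SEA', 'ANC'
_date = re.sub(r'(\d{4})(\d{2})(\d{2})', r'\1-\2-\3', _date)
print(_from, _to, _date)                 # SEA ANC 2024-03-15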
class MNIST(Dataset):
    """
    Sets up an MNIST dataset.

    Attributes:
        raw_base_url (str): where to find the source data
        raw_train_input_gz (str): URL of the full path to raw train inputs
        raw_train_target_gz (str): URL of the full path to raw train targets
        raw_test_input_gz (str): URL of the full path to raw test inputs
        raw_test_target_gz (str): URL of the full path to raw test targets
        backend (neon.backends.Backend): backend used for this data
        inputs (dict): structure housing the loaded train/test/validation
                       input data
        targets (dict): structure housing the loaded train/test/validation
                        target data

    Keyword Args:
        repo_path (str, optional): where to locally host this dataset on disk
    """
    raw_base_url = 'http://yann.lecun.com/exdb/mnist/'
    raw_train_input_gz = basejoin(raw_base_url, 'train-images-idx3-ubyte.gz')
    raw_train_target_gz = basejoin(raw_base_url, 'train-labels-idx1-ubyte.gz')
    raw_test_input_gz = basejoin(raw_base_url, 't10k-images-idx3-ubyte.gz')
    raw_test_target_gz = basejoin(raw_base_url, 't10k-labels-idx1-ubyte.gz')

    def __init__(self, **kwargs):
        self.num_test_sample = 10000
        self.macro_batched = False
        self.__dict__.update(kwargs)

    def initialize(self):
        pass

    def read_image_file(self, fname, dtype=None):
        """
        Carries out the actual reading of MNIST image files.
        """
        with open(fname, 'rb') as f:
            magic, num_images, rows, cols = struct.unpack('>iiii', f.read(16))
            if magic != 2051:
                raise ValueError('invalid MNIST image file: ' + fname)
            full_image = np.fromfile(f, dtype='uint8').reshape(
                (num_images, rows * cols))

        if dtype is not None:
            dtype = np.dtype(dtype)
            full_image = full_image.astype(dtype)
            full_image /= 255.

        return full_image

    def read_label_file(self, fname):
        """
        Carries out the actual reading of MNIST label files.
        """
        with open(fname, 'rb') as f:
            magic, num_labels = struct.unpack('>ii', f.read(8))
            if magic != 2049:
                raise ValueError('invalid MNIST label file: ' + fname)
            array = np.fromfile(f, dtype='uint8')

        return array

    def load(self, backend=None, experiment=None):
        if self.inputs['train'] is not None:
            return
        if 'repo_path' in self.__dict__:
            self.repo_path = os.path.expandvars(os.path.expanduser(
                self.repo_path))
            save_dir = os.path.join(self.repo_path,
                                    self.__class__.__name__)
            if not os.path.exists(save_dir):
                os.makedirs(save_dir)
            for url in (self.raw_train_input_gz, self.raw_train_target_gz,
                        self.raw_test_input_gz, self.raw_test_target_gz):
                name = os.path.basename(url).rstrip('.gz')
                repo_gz_file = os.path.join(save_dir, name + '.gz')
                repo_file = repo_gz_file.rstrip('.gz')
                if not os.path.exists(repo_file):
                    self.download_to_repo(url, save_dir)
                    with gzip.open(repo_gz_file, 'rb') as infile:
                        with open(repo_file, 'w') as outfile:
                            for line in infile:
                                outfile.write(line)
                logger.info('loading: %s', name)
                if 'images' in repo_file and 'train' in repo_file:
                    indat = self.read_image_file(repo_file, 'float32')
                    # flatten to 1D images
                    self.inputs['train'] = indat
                elif 'images' in repo_file and 't10k' in repo_file:
                    indat = self.read_image_file(repo_file, 'float32')
                    self.inputs['test'] = indat[0:self.num_test_sample]
                elif 'labels' in repo_file and 'train' in repo_file:
                    indat = self.read_label_file(repo_file)
                    # Prep a 1-hot label encoding
                    tmp = np.zeros((indat.shape[0], 10), dtype=np.float32)
                    for col in range(10):
                        tmp[:, col] = indat == col
                    self.targets['train'] = tmp
                elif 'labels' in repo_file and 't10k' in repo_file:
                    indat = self.read_label_file(
                        repo_file)[0:self.num_test_sample]
                    tmp = np.zeros((self.num_test_sample, 10),
                                   dtype=np.float32)
                    for col in range(10):
                        tmp[:, col] = indat == col
                    self.targets['test'] = tmp
                else:
                    logger.error('problems loading: %s', name)
            if 'sample_pct' in self.__dict__:
                self.sample_training_data()
            if hasattr(self, 'validation_pct'):
                self.split_set(
                    self.validation_pct, from_set='train', to_set='validation')
            self.format()
        else:
            raise AttributeError('repo_path not specified in config')
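# A small standalone sketch of the 1-hot label encoding built in MNIST.load()
# above; the label values here are made up for illustration.
import numpy as np

indat = np.array([3, 0, 9, 3], dtype='uint8')          # hypothetical MNIST labels
tmp = np.zeros((indat.shape[0], 10), dtype=np.float32)
for col in range(10):
    tmp[:, col] = indat == col                          # one column per digit class
print(tmp[0])  # [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]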
class SPARSENET(Dataset):
    """
    Sets up a Sparsenet dataset.

    Attributes:
        raw_base_url (str): where to find the source data
        raw_train_input_gz (str): URL of the full path to raw train inputs
        raw_train_target_gz (str): URL of the full path to raw train targets
        raw_test_input_gz (str): URL of the full path to raw test inputs
        raw_test_target_gz (str): URL of the full path to raw test targets
        backend (neon.backends.Backend): backend used for this data
        inputs (dict): structure housing the loaded train/test/validation
                       input data
        targets (dict): structure housing the loaded train/test/validation
                        target data

    Keyword Args:
        repo_path (str, optional): where to locally host this dataset on disk
    """
    raw_base_url = 'http://redwood.berkeley.edu/bruno/sparsenet/'
    raw_train_whitened = basejoin(raw_base_url, 'IMAGES.mat')
    raw_train_unwhitened = basejoin(raw_base_url, 'IMAGES_RAW.mat')

    def __init__(self, **kwargs):
        self.macro_batched = False
        self.__dict__.update(kwargs)

    def read_image_file(self, fname, dtype=None):
        """
        Carries out the actual reading of Sparsenet image files.
        """
        logger.info("in read_image_file, reading: %s", fname)
        with open(fname, 'rb') as infile:
            array = pickle.load(infile)
            infile.close()
        return array

    def load(self, backend=None, experiment=None):
        """
        main function
        """
        import scipy.io
        if 'repo_path' in self.__dict__:
            self.repo_path = os.path.expandvars(
                os.path.expanduser(self.repo_path))
            save_dir = os.path.join(self.repo_path,
                                    self.__class__.__name__)
            if not os.path.exists(save_dir):
                os.makedirs(save_dir)
            train_idcs = list(range(10000))
            if 'sample_pct' in self.__dict__:
                if self.sample_pct > 1.0:
                    self.sample_pct /= 100.0
                if self.sample_pct < 1.0:
                    numpy.random.seed(self.backend.rng_seed)
                    numpy.random.shuffle(train_idcs)
                train_idcs = train_idcs[0:int(10000 * self.sample_pct)]
            for url in (self.raw_train_unwhitened, self.raw_train_whitened):
                name = os.path.basename(url).rstrip('.mat')
                repo_mat_file = os.path.join(save_dir, name + '.mat')
                repo_file = repo_mat_file.rstrip('.mat')
                # download and create dataset
                if not os.path.exists(repo_file):
                    self.download_to_repo(url, save_dir)
                    infile = scipy.io.loadmat(repo_mat_file)
                    with open(repo_file, 'wb') as outfile:
                        data = infile[infile.keys()[0]]
                        # patches are extracted so they can be cached
                        # doing non-overlapping 16x16 patches (1024 per image)
                        patches = data.reshape(512 / 16, 16, 512 / 16, 16, 10)
                        patches = patches.transpose(1, 3, 0, 2, 4)
                        patches = patches.reshape(16, 16, 1024 * 10)
                        logger.info("Caching to pickle file: %s", outfile)
                        pickle.dump(patches, outfile)
                        outfile.close()
                logger.info('loading: %s', name)
                # load existing data
                if 'IMAGES' in repo_file:
                    indat = self.read_image_file(repo_file, 'float32')
                    # flatten to 1D images
                    indat = indat.reshape((256, 10240)).transpose()[train_idcs]
                    self.inputs['train'] = indat
                else:
                    logger.error('problems loading: %s', name)
            self.format()
        else:
            raise AttributeError('repo_path not specified in config')
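# A small sketch of the non-overlapping 16x16 patch extraction performed in
# SPARSENET.load() above, using random data in place of the 512x512x10
# Sparsenet image stack.
import numpy as np

data = np.random.rand(512, 512, 10).astype('float32')     # stand-in for the IMAGES.mat contents
patches = data.reshape(512 // 16, 16, 512 // 16, 16, 10)  # split both spatial axes into 32 blocks of 16
patches = patches.transpose(1, 3, 0, 2, 4)                # bring the 16x16 patch dims to the front
patches = patches.reshape(16, 16, 1024 * 10)              # 32*32 = 1024 patches per image
print(patches.shape)                                       # (16, 16, 10240)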