Example #1
def cookies_parse(self, response):
    cookies = self._set_cookies(response)
    base_url = 'https://m.alaskaair.com/'
    # pull the tracking-pixel src out of the hidden div
    src = response.xpath(
        '//div[@style="position:fixed; top:0; left:0; display:none"]/img/@src'
    ).extract_first()
    appid_url = parse.basejoin(base_url, src)
    headers = {
        'accept': "image/webp,image/apng,image/*,*/*;q=0.8",
        'accept-encoding': "gzip, deflate, br",
        'accept-language': "zh-CN,zh;q=0.9",
        'referer': "https://m.alaskaair.com/shopping/flights",
        'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/65.0.3325.162 Safari/537.36",
    }
    yield scrapy.Request(url=appid_url,
                         headers=headers,
                         cookies=cookies,
                         callback=self.pxvid_parse,
                         errback=self.errback,
                         meta={'res': response})
Example #2
def issue_to_pr(issuenum, srcbranch, repo='astropy', sourceuser='',
                targetuser='******', targetbranch='master',
                baseurl=GITHUB_API_BASE_URL):
    """
    Attaches code to an issue, converting a regular issue into a pull request.

    Parameters
    ----------
    issuenum : int
        The issue number (in `targetuser`/`repo`) onto which the code should be
        attached.
    srcbranch : str
        The branch (in `sourceuser`/`repo`) from which to attach the code.
        After this is complete, further updates to this branch will be passed
        on to the pull request.
    repo : str
        The name of the repository (the same-name repo should be present for
        both `targetuser` and `sourceuser`).
    sourceuser : str
        The name of the user/organization that holds `srcbranch`; if empty,
        it is prompted for interactively.
    targetuser : str
        The name of the user/organization that has the issue.
    targetbranch : str
        The name of the branch (in `targetuser`/`repo`) that the pull request
        should merge into.
    baseurl : str
        The URL to use to access the GitHub site (including protocol).

    .. warning::
        Be cautious supplying a password as a plain string - if you do this
        in an IPython session, for example, it will be logged in the input
        history, revealing your password in clear text.  When in doubt, let
        `get_credentials` prompt for it securely.

    Returns
    -------
    response : dict
        The JSON-decoded response from the GitHub server.
    error message : str, optional
        If present, indicates GitHub responded with an HTTP error.

    """

    while not sourceuser:
        # raw_input is the Python 2 builtin; input() is the Python 3 spelling
        sourceuser = raw_input('Enter GitHub username to create pull request '
                               'from: ').strip()

    username, password = get_credentials(username=sourceuser)

    data = {'issue': str(issuenum),
            'head': sourceuser + ':' + srcbranch,
            'base': targetbranch}

    datajson = json.dumps(data)

    suburl = '{user}/{repo}/pulls'.format(user=targetuser, repo=repo)
    url = basejoin(baseurl, suburl)
    res = requests.post(url, data=datajson, auth=(username, password))
    return res.json()
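A hypothetical call (the issue number, branch, and username below are made up; `GITHUB_API_BASE_URL`, `get_credentials`, and the `json`/`requests` imports come from the surrounding module):

    # turn issue #1234 on targetuser/astropy into a pull request that
    # merges my-feature-branch from myname's fork
    response = issue_to_pr(1234, 'my-feature-branch', sourceuser='myname')
    print(response.get('html_url'))  # URL of the new pull request, if created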
Example #3
def redirect_internal(self, url, fp, errcode, errmsg, headers, data):
    if 'location' in headers:
        newurl = headers['location']
    elif 'uri' in headers:
        newurl = headers['uri']
    else:
        return
    # drain and close the redirect response before following it
    void = fp.read()
    fp.close()
    # In case the server sent a relative URL, join with original:
    newurl = basejoin(self.type + ":" + url, newurl)
    return self.open(newurl)
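This matches the redirect handler in Python 2's `urllib.URLopener`: `basejoin` resolves a relative `Location` header against the original URL. A quick illustration of that resolution (Python 3 spelling, since `basejoin` is the same function as `urljoin`):

    from urllib.parse import urljoin

    base = 'http://www.example.com/docs/index.html'
    print(urljoin(base, 'intro.html'))     # http://www.example.com/docs/intro.html
    print(urljoin(base, '/img/logo.png'))  # http://www.example.com/img/logo.png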
Example #4
def start_requests(self):
    while True:
        result = self.get_task(self.spider_name)
        airports, _date, _num = result[0].split(':')
        _from, _to = airports.split('-')

        # map airport codes through the cache, falling back to the raw code
        _from = self.abc_cache.get(_from, _from)
        _to = self.abc_cache.get(_to, _to)
        _date = re.sub(r'(\d{4})(\d{2})(\d{2})', r'\1-\2-\3', _date)
        params = '{_from}/{_to}?from={_date}&adults=5&children&infants=0&preselect=true'.format(
            _date=_date, _from=_from, _to=_to)
        # start_urls is used here as a single base-URL string
        total_url = parse.basejoin(self.start_urls, params)

        yield scrapy.Request(total_url,
                             meta={'origin': 1},
                             errback=self.errback,
                             callback=self.parse)
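The `re.sub` call rewrites the compact task date into the ISO form the query string expects; a standalone check (the input value is made up):

    import re

    print(re.sub(r'(\d{4})(\d{2})(\d{2})', r'\1-\2-\3', '20240131'))
    # -> 2024-01-31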
Example #5
class MNIST(Dataset):

    """
    Sets up an MNIST dataset.

    Attributes:
        raw_base_url (str): where to find the source data
        raw_train_input_gz (str): URL of the full path to raw train inputs
        raw_train_target_gz (str): URL of the full path to raw train targets
        raw_test_input_gz (str): URL of the full path to raw test inputs
        raw_test_target_gz (str): URL of the full path to raw test targets
        backend (neon.backends.Backend): backend used for this data
        inputs (dict): structure housing the loaded train/test/validation
                       input data
        targets (dict): structure housing the loaded train/test/validation
                        target data

    Keyword Args:
        repo_path (str, optional): where to locally host this dataset on disk
    """
    raw_base_url = 'http://yann.lecun.com/exdb/mnist/'
    raw_train_input_gz = basejoin(raw_base_url, 'train-images-idx3-ubyte.gz')
    raw_train_target_gz = basejoin(raw_base_url, 'train-labels-idx1-ubyte.gz')
    raw_test_input_gz = basejoin(raw_base_url, 't10k-images-idx3-ubyte.gz')
    raw_test_target_gz = basejoin(raw_base_url, 't10k-labels-idx1-ubyte.gz')

    def __init__(self, **kwargs):
        self.num_test_sample = 10000
        self.macro_batched = False
        self.__dict__.update(kwargs)

    def initialize(self):
        pass

    def read_image_file(self, fname, dtype=None):
        """
        Carries out the actual reading of MNIST image files.
        """
        with open(fname, 'rb') as f:
            magic, num_images, rows, cols = struct.unpack('>iiii', f.read(16))
            if magic != 2051:
                raise ValueError('invalid MNIST image file: ' + fname)
            full_image = np.fromfile(f, dtype='uint8').reshape((num_images,
                                                                rows * cols))

        if dtype is not None:
            dtype = np.dtype(dtype)
            full_image = full_image.astype(dtype)
            # scale pixel values to [0, 1]; assumes a float dtype was requested
            full_image /= 255.

        return full_image

    def read_label_file(self, fname):
        """
        Carries out the actual reading of MNIST label files.
        """
        with open(fname, 'rb') as f:
            magic, num_labels = struct.unpack('>ii', f.read(8))
            if magic != 2049:
                raise ValueError('invalid MNIST label file:' + fname)
            array = np.fromfile(f, dtype='uint8')
        return array

    def load(self, backend=None, experiment=None):
        if self.inputs['train'] is not None:
            return
        if 'repo_path' in self.__dict__:
            self.repo_path = os.path.expandvars(os.path.expanduser(
                self.repo_path))
            save_dir = os.path.join(self.repo_path,
                                    self.__class__.__name__)
            if not os.path.exists(save_dir):
                os.makedirs(save_dir)

            for url in (self.raw_train_input_gz, self.raw_train_target_gz,
                        self.raw_test_input_gz, self.raw_test_target_gz):
                # os.path.splitext, not str.rstrip('.gz'): rstrip removes a
                # character set, so it can mangle names ending in g or z
                name = os.path.splitext(os.path.basename(url))[0]
                repo_gz_file = os.path.join(save_dir, name + '.gz')
                repo_file = os.path.splitext(repo_gz_file)[0]
                if not os.path.exists(repo_file):
                    self.download_to_repo(url, save_dir)
                    with gzip.open(repo_gz_file, 'rb') as infile:
                        # gzip yields bytes, so write the output in binary mode
                        with open(repo_file, 'wb') as outfile:
                            for line in infile:
                                outfile.write(line)
                logger.info('loading: %s', name)
                if 'images' in repo_file and 'train' in repo_file:
                    indat = self.read_image_file(repo_file, 'float32')
                    # flatten to 1D images
                    self.inputs['train'] = indat
                elif 'images' in repo_file and 't10k' in repo_file:
                    indat = self.read_image_file(repo_file, 'float32')
                    self.inputs['test'] = indat[0:self.num_test_sample]
                elif 'labels' in repo_file and 'train' in repo_file:
                    indat = self.read_label_file(repo_file)
                    # Prep a 1-hot label encoding
                    tmp = np.zeros((indat.shape[0], 10), dtype=np.float32)
                    for col in range(10):
                        tmp[:, col] = indat == col
                    self.targets['train'] = tmp
                elif 'labels' in repo_file and 't10k' in repo_file:
                    indat = self.read_label_file(
                        repo_file)[0:self.num_test_sample]
                    tmp = np.zeros((self.num_test_sample, 10),
                                   dtype=np.float32)
                    for col in range(10):
                        tmp[:, col] = indat == col
                    self.targets['test'] = tmp
                else:
                    logger.error('problems loading: %s', name)
            if 'sample_pct' in self.__dict__:
                self.sample_training_data()
            if hasattr(self, 'validation_pct'):
                self.split_set(
                    self.validation_pct, from_set='train', to_set='validation')
            self.format()
        else:
            raise AttributeError('repo_path not specified in config')
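The label-loading branches above build one-hot targets one column at a time; an equivalent vectorized construction, as a standalone sketch outside the neon `Dataset` machinery:

    import numpy as np

    labels = np.array([5, 0, 4, 1])  # example MNIST labels
    onehot = np.zeros((labels.shape[0], 10), dtype=np.float32)
    onehot[np.arange(labels.shape[0]), labels] = 1.0
    # row i now has a single 1.0 in column labels[i]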
Example #6
class SPARSENET(Dataset):
    """
    Sets up a Sparsenet dataset.

    Attributes:
        raw_base_url (str): where to find the source data
        raw_train_input_gz (str): URL of the full path to raw train inputs
        raw_train_target_gz (str): URL of the full path to raw train targets
        raw_test_input_gz (str): URL of the full path to raw test inputs
        raw_test_target_gz (str): URL of the full path to raw test targets
        backend (neon.backends.Backend): backend used for this data
        inputs (dict): structure housing the loaded train/test/validation
                       input data
        targets (dict): structure housing the loaded train/test/validation
                        target data

    Keyword Args:
        repo_path (str, optional): where to locally host this dataset on disk
    """
    raw_base_url = 'http://redwood.berkeley.edu/bruno/sparsenet/'
    raw_train_whitened = basejoin(raw_base_url, 'IMAGES.mat')
    raw_train_unwhitened = basejoin(raw_base_url, 'IMAGES_RAW.mat')

    def __init__(self, **kwargs):
        self.macro_batched = False
        self.__dict__.update(kwargs)

    def read_image_file(self, fname, dtype=None):
        """
        Carries out the actual reading of Sparsenet image files.
        """
        logger.info("in read_image_file, reading: %s", fname)
        with open(fname, 'rb') as infile:
            array = pickle.load(infile)
        return array

    def load(self, backend=None, experiment=None):
        """
        main function
        """
        import scipy.io
        if 'repo_path' in self.__dict__:
            self.repo_path = os.path.expandvars(
                os.path.expanduser(self.repo_path))
            save_dir = os.path.join(self.repo_path, self.__class__.__name__)
            if not os.path.exists(save_dir):
                os.makedirs(save_dir)
            train_idcs = list(range(10000))
            if 'sample_pct' in self.__dict__:
                if self.sample_pct > 1.0:
                    self.sample_pct /= 100.0
                if self.sample_pct < 1.0:
                    numpy.random.seed(self.backend.rng_seed)
                    numpy.random.shuffle(train_idcs)
                train_idcs = train_idcs[0:int(10000 * self.sample_pct)]
            for url in (self.raw_train_unwhitened, self.raw_train_whitened):
                # os.path.splitext, not str.rstrip('.mat'): rstrip removes a
                # character set, so it can mangle names ending in m, a, or t
                name = os.path.splitext(os.path.basename(url))[0]
                repo_mat_file = os.path.join(save_dir, name + '.mat')
                repo_file = os.path.splitext(repo_mat_file)[0]
                # download and create dataset
                if not os.path.exists(repo_file):
                    self.download_to_repo(url, save_dir)
                    infile = scipy.io.loadmat(repo_mat_file)
                    with open(repo_file, 'wb') as outfile:
                        # note: loadmat also returns metadata keys such as
                        # '__header__'; relying on key order here is fragile
                        data = infile[list(infile.keys())[0]]
                        # patches are extracted so they can be cached
                        # doing non-overlapping 16x16 patches (1024 per image)
                        patches = data.reshape(512 // 16, 16, 512 // 16, 16, 10)
                        patches = patches.transpose(1, 3, 0, 2, 4)
                        patches = patches.reshape(16, 16, 1024 * 10)
                        logger.info("Caching to pickle file: %s", repo_file)
                        pickle.dump(patches, outfile)
                logger.info('loading: %s', name)
                # load existing data
                if 'IMAGES' in repo_file:
                    indat = self.read_image_file(repo_file, 'float32')
                    # flatten to 1D images
                    indat = indat.reshape((256, 10240)).transpose()[train_idcs]
                    self.inputs['train'] = indat
                else:
                    logger.error('problems loading: %s', name)
            self.format()
        else:
            raise AttributeError('repo_path not specified in config')
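The reshape/transpose pair above is the usual NumPy trick for cutting non-overlapping patches without explicit loops; a self-contained sketch on a tiny array:

    import numpy as np

    img = np.arange(4 * 4).reshape(4, 4)    # one 4x4 "image"
    # split into non-overlapping 2x2 patches: (row-blocks, ph, col-blocks, pw)
    patches = img.reshape(4 // 2, 2, 4 // 2, 2)
    patches = patches.transpose(1, 3, 0, 2)  # (ph, pw, row-blocks, col-blocks)
    patches = patches.reshape(2, 2, 4)       # 4 patches of 2x2
    print(patches[:, :, 0])                  # top-left patch: [[0 1] [4 5]]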