def __iter__(self):
    filepath = download(self._url, self._cache_dir)
    with tarfile.open(filepath) as archive:
        for filename in archive.getnames():
            if filename.startswith('aclImdb/train/pos/'):
                yield self._read(archive, filename), True
            elif filename.startswith('aclImdb/train/neg/'):
                yield self._read(archive, filename), False
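# The iterator above (and the __init__ and _read_pages snippets below) assume a
# cache-aware download(url, cache_dir) helper that returns a local file path.
# That helper is not shown in this section; a minimal sketch under that
# assumption:
import os
import urllib.request

def download(url, cache_dir):
    os.makedirs(cache_dir, exist_ok=True)
    filepath = os.path.join(cache_dir, os.path.basename(url))
    if not os.path.isfile(filepath):
        # Only fetch the file if it is not already in the cache directory
        urllib.request.urlretrieve(url, filepath)
    return filepath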
def test_reference_file_readonly(filename, md5, ref_sum, ref_num_cells):
    filename = helpers.download(filename, md5)
    mesh = meshio.read(filename)
    tol = 1.0e-2
    s = mesh.points.sum()
    assert abs(s - ref_sum) < tol * ref_sum
    assert {k: len(v) for k, v in mesh.cells.items()} == ref_num_cells
    assert {
        k: len(v["gmsh:physical"]) for k, v in mesh.cell_data.items()
    } == ref_num_cells
def test_reference_file(filename, md5, ref_sum, ref_num_cells, write_binary):
    filename = helpers.download(filename, md5)
    mesh = meshio.read(filename)
    tol = 1.0e-2
    s = numpy.sum(mesh.points)
    assert abs(s - ref_sum) < tol * ref_sum
    assert len(mesh.cells["triangle"]) == ref_num_cells
    writer = partial(meshio.vtk_io.write, write_binary=write_binary)
    helpers.write_read(writer, meshio.vtk_io.read, mesh, 1.0e-15)
def test_reference_file(filename, md5, ref_sum, ref_num_cells, write_binary):
    filename = helpers.download(filename, md5)
    mesh = meshio.read(filename)
    tol = 1.0e-2
    s = mesh.points.sum()
    assert abs(s - ref_sum) < tol * ref_sum
    assert {k: len(v) for k, v in mesh.cells.items()} == ref_num_cells
    assert {
        k: len(v["gmsh:physical"]) for k, v in mesh.cell_data.items()
    } == ref_num_cells
    writer = partial(meshio.msh_io.write, fmt_version="2", write_binary=write_binary)
    helpers.write_read(writer, meshio.msh_io.read, mesh, 1.0e-15)
def _read_pages(self, url):
    """
    Extract plain words from a Wikipedia dump and store them in the pages
    file. Each page becomes one line of space-separated words.
    """
    wikipedia_path = download(url, self._cache_dir)
    with bz2.open(wikipedia_path) as wikipedia, \
            bz2.open(self._pages_path, 'wt') as pages:
        for _, element in etree.iterparse(wikipedia, tag='{*}page'):
            if element.find('./{*}redirect') is not None:
                continue
            page = element.findtext('./{*}revision/{*}text')
            words = self._tokenize(page)
            pages.write(' '.join(words) + '\n')
            element.clear()
def test_reference_file_with_mixed_cells():
    filename = "med/cylinder.med"
    md5 = "e36b365542c72ef470b83fc21f4dad58"
    filename = helpers.download(filename, md5)
    mesh = meshio.read(filename)

    # Points
    assert numpy.isclose(mesh.points.sum(), 16.53169892762988)

    # Cells
    ref_num_cells = {"pyramid": 18, "quad": 18, "line": 17, "tetra": 63, "triangle": 4}
    assert {k: len(v) for k, v in mesh.cells.items()} == ref_num_cells

    # Point tags
    assert mesh.point_data["point_tags"].sum() == 52
    ref_point_tags_info = {2: ["Side"], 3: ["Side", "Top"], 4: ["Top"]}
    assert mesh.point_tags == ref_point_tags_info

    # Cell tags
    ref_sum_cell_tags = {
        "pyramid": -116,
        "quad": -75,
        "line": -48,
        "tetra": -24,
        "triangle": -30,
    }
    assert {
        k: v["cell_tags"].sum() for k, v in mesh.cell_data.items()
    } == ref_sum_cell_tags
    ref_cell_tags_info = {
        -6: ["Top circle"],
        -7: ["Top", "Top and down"],
        -8: ["Top and down"],
        -9: ["A", "B"],
        -10: ["B"],
        -11: ["B", "C"],
        -12: ["C"],
    }
    assert mesh.cell_tags == ref_cell_tags_info

    helpers.write_read(meshio.med_io.write, meshio.med_io.read, mesh, 1.0e-15)
def test_reference_file_with_point_cell_data():
    filename = "med/box.med"
    md5 = "0867fb11bd14b83ad11ab20e2b1fd57d"
    filename = helpers.download(filename, md5)
    mesh = meshio.read(filename)

    # Points
    assert numpy.isclose(mesh.points.sum(), 12)

    # Cells
    assert {k: len(v) for k, v in mesh.cells.items()} == {"hexahedron": 1}

    # Point data
    data_u = mesh.point_data["resu____DEPL"]
    assert data_u.shape == (8, 3)
    assert numpy.isclose(data_u.sum(), 12)

    # Cell data
    # ELNO (1 data point for every node of each element)
    data_eps = mesh.cell_data["hexahedron"]["resu____EPSI_ELNO"]
    assert data_eps.shape == (1, 8, 6)  # (n_cells, n_nodes_per_element, n_components)
    data_eps_mean = numpy.mean(data_eps, axis=1)[0]
    eps_ref = numpy.array([1, 0, 0, 0.5, 0.5, 0])
    assert numpy.allclose(data_eps_mean, eps_ref)

    data_sig = mesh.cell_data["hexahedron"]["resu____SIEF_ELNO"]
    assert data_sig.shape == (1, 8, 6)  # (n_cells, n_nodes_per_element, n_components)
    data_sig_mean = numpy.mean(data_sig, axis=1)[0]
    sig_ref = numpy.array(
        [7328.44611253, 2645.87030114, 2034.06063679, 1202.6, 569.752, 0]
    )
    assert numpy.allclose(data_sig_mean, sig_ref)

    data_psi = mesh.cell_data["hexahedron"]["resu____ENEL_ELNO"]
    assert data_psi.shape == (1, 8, 1)  # (n_cells, n_nodes_per_element, n_components)

    # ELEM (1 data point for each element)
    data_psi_elem = mesh.cell_data["hexahedron"]["resu____ENEL_ELEM"]
    assert numpy.isclose(numpy.mean(data_psi, axis=1)[0, 0], data_psi_elem[0])

    helpers.write_read(meshio.med_io.write, meshio.med_io.read, mesh, 1.0e-15)
def test_reference_file_with_point_cell_data():
    filename = "med/box.med"
    md5 = "0867fb11bd14b83ad11ab20e2b1fd57d"
    filename = helpers.download(filename, md5)
    mesh = meshio.read(filename)

    # Points
    assert numpy.isclose(mesh.points.sum(), 12)

    # Cells
    assert {k: len(v) for k, v in mesh.cells.items()} == {"hexahedron": 1}

    # Point data
    data_u = mesh.point_data["resu____DEPL"]
    assert data_u.shape == (8, 3)
    assert numpy.isclose(data_u.sum(), 12)

    # Cell data
    # ELNO (1 data point for every node of each element)
    data_eps = mesh.cell_data["hexahedron"]["resu____EPSI_ELNO"]
    assert data_eps.shape == (1, 8, 6)  # (n_cells, n_nodes_per_element, n_components)
    data_eps_mean = numpy.mean(data_eps, axis=1)[0]
    eps_ref = numpy.array([1, 0, 0, 0.5, 0.5, 0])
    assert numpy.allclose(data_eps_mean, eps_ref)

    data_sig = mesh.cell_data["hexahedron"]["resu____SIEF_ELNO"]
    assert data_sig.shape == (1, 8, 6)  # (n_cells, n_nodes_per_element, n_components)
    data_sig_mean = numpy.mean(data_sig, axis=1)[0]
    sig_ref = numpy.array(
        [7328.44611253, 2645.87030114, 2034.06063679, 1202.6, 569.752, 0]
    )
    assert numpy.allclose(data_sig_mean, sig_ref)

    data_psi = mesh.cell_data["hexahedron"]["resu____ENEL_ELNO"]
    assert data_psi.shape == (1, 8)  # (n_cells, n_nodes_per_element) with the trailing 1 cut off

    # ELEM (1 data point for each element)
    data_psi_elem = mesh.cell_data["hexahedron"]["resu____ENEL_ELEM"]
    assert numpy.isclose(numpy.mean(data_psi, axis=1)[0], data_psi_elem[0])

    helpers.write_read(meshio.med_io.write, meshio.med_io.read, mesh, 1.0e-15)
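# The meshio tests above lean on two helpers that are not shown in this
# section: helpers.download(filename, md5), which fetches a reference mesh
# into a local cache and verifies its checksum, and
# helpers.write_read(writer, reader, mesh, atol), which round-trips a mesh
# through a writer/reader pair. A minimal sketch of the round-trip helper,
# under the assumption that comparing points and cell arrays within atol is
# all that is needed (the real helper may check more fields):
import os
import tempfile

import numpy

def write_read(writer, reader, mesh, atol):
    with tempfile.TemporaryDirectory() as tmpdir:
        filepath = os.path.join(tmpdir, 'test.dat')
        writer(filepath, mesh)       # write the mesh with the format under test
        mesh2 = reader(filepath)     # read it straight back
    assert numpy.allclose(mesh.points, mesh2.points, rtol=0.0, atol=atol)
    for cell_type, cells in mesh.cells.items():
        assert numpy.array_equal(cells, mesh2.cells[cell_type])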
def getData():
    r = download('https://adventofcode.com/2018/day/3/input')
    data = []
    regex = re.compile(r'^#(\d+) @ (\d+),(\d+): (\d+)x(\d+)$')
    # The wording of the problem says the size is "at least 1000x1000".
    # My input is exactly 1000x1000, but let's play it safe and work it out anyway.
    max_x = 0
    max_y = 0
    for line in [x.decode() for x in r.iter_lines()]:
        linematch = regex.match(line)
        _, left, top, width, height = map(int, linematch.groups())
        # Process each line into a list of (x, y) tuples that its rectangle covers
        coords = []
        for x in range(left, left + width):
            for y in range(top, top + height):
                coords.append((x, y))
        max_x = max(max_x, left + width)
        max_y = max(max_y, top + height)
        data.append(coords)
    print('Grid size is {}x{}'.format(max_x, max_y))
    return data, (max_x, max_y)
def getData():
    r = download('https://adventofcode.com/2018/day/12/input')
    iterlines = r.iter_lines()
    # Prepend and append some padding to allow the 5-wide window tests
    initialState = padding + [x == b'#'[0] for x in next(iterlines)[15:]] + padding
    # Skip the empty line in the input
    _ = next(iterlines)
    rules = []
    for line in iterlines:
        # Only store the rules which grow plants
        if line[9] == b'#'[0]:
            rule = 0
            for x in line[0:5]:
                rule = rule << 1
                rule += x == b'#'[0]
            rules.append(rule)
    return np.array(initialState, dtype=np.bool), np.array(rules, dtype=np.uint8)
def getData():
    r = download('https://adventofcode.com/2018/day/7/input')
    regex = re.compile(
        r'^Step ([A-Z]) must be finished before step ([A-Z]) can begin\.$')
    data = {}
    available = []
    for line in r.iter_lines():
        blocker, blockee = regex.match(line.decode()).groups()
        if blocker not in data:
            data[blocker] = task(blocker)
            available.append(data[blocker])
        if blockee not in data:
            data[blockee] = task(blockee)
        data[blocker].blocks.add(data[blockee])
        data[blockee].blockedby.add(data[blocker])
        if data[blockee] in available:
            available.remove(data[blockee])
    # Sort the available tasks by name, in reverse, so that .pop() gets the next one
    available.sort(reverse=True)
    return data, available
def getData():
    r = download('https://adventofcode.com/2018/day/13/input')
    track = []
    carts = []
    for y, line in enumerate(r.iter_lines()):
        track.append([])
        for x, char in enumerate(line.decode()):
            if char in ['^', 'v']:
                track[-1].append('|')
                carts.append((x, y, 0, 1 if char == '^' else -1, 0))
            elif char in ['>', '<']:
                track[-1].append('-')
                carts.append((x, y, 1 if char == '>' else -1, 0, 0))
            else:
                track[-1].append(char)
    carts = np.array(carts, dtype={
        'names': ['x', 'y', 'dirx', 'diry', 'rot'],
        'formats': [np.int16 for _ in range(5)]
    })
    return np.array(track), carts
def getData():
    r = download('https://adventofcode.com/2018/day/23/input')
    regex = re.compile(r'^pos=<(-?\d+),(-?\d+),(-?\d+)>, r=(\d+)$')
    data = np.array(
        list(map(lambda x: tuple(map(int, regex.match(x.decode()).groups())),
                 r.iter_lines())),
        dtype={'names': ['x', 'y', 'z', 'r'],
               'formats': [np.int32, np.int32, np.int32, np.uint32]})
    return data
def getData():
    r = download('https://adventofcode.com/2018/day/9/input')
    regex = re.compile(r'^(\d+) players; last marble is worth (\d+) points$')
    return tuple(map(int, regex.match(r.text.strip()).groups()))
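# The Advent of Code getData() functions in this section use a different
# download(url) variant from the cache-based helper sketched earlier: one that
# returns a requests.Response authenticated with the puzzle-input session
# cookie. That helper is not shown here; a minimal sketch under that
# assumption (the cookie value is a placeholder):
import requests

def download(url, session_cookie='PUT-YOUR-AOC-SESSION-COOKIE-HERE'):
    response = requests.get(url, cookies={'session': session_cookie})
    response.raise_for_status()  # fail loudly on a bad cookie or wrong URL
    return response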
def add_from(request, prefix_lower, prefix_normal, multi=False):
    url = request.form.get(f'{prefix_lower}-art-url')
    if url is None and request.get_json() is not None:
        url = request.get_json()[f'{prefix_lower}-art-url']
    if url != '' and url is not None:
        if prefix_lower == 'deviantart':
            art = scrapers.deviant_art(url)
        elif prefix_lower == 'artstation':
            if multi:
                art = scrapers.art_station(url, True)
            else:
                art = scrapers.art_station(url)
        elif prefix_lower == 'pixiv':
            art = scrapers.pixiv(url, load_pickle().get('pixiv_username'),
                                 load_pickle().get('pixiv_password'))
        elif prefix_lower == 'tumblr':
            art = scrapers.tumblr(url)
        elif prefix_lower == 'instagram':
            art = scrapers.instagram(url)
        elif prefix_lower == 'reddit':
            art = scrapers.reddit(url)
        elif prefix_lower == 'twitter':
            art = scrapers.twitter(url)
        title = art['title']
        if multi:
            images = []
            for image_url in art['image_url']:
                images.append(helpers.download(image_url, UPLOAD_FOLDER))
            images = ','.join(images)
            image = images
        else:
            if prefix_lower == 'pixiv':
                image = helpers.download(art['image_url'], UPLOAD_FOLDER, art['source'])
            else:
                image = helpers.download(art['image_url'], UPLOAD_FOLDER)
        source = art['source']
        artist_name = art['artist_name']
        artist_website = art['artist_website']
        g.db = connect_db()
        if request.form.get('existing-artist'):
            artist_id = request.form.get('artist-id')
        else:
            artist = g.db.execute('SELECT id FROM artist WHERE website=?',
                                  [artist_website]).fetchone()
            if artist is not None:
                artist_id = artist[0]
            else:
                cursor = g.db.execute('INSERT into artist(name, website) VALUES(?,?)',
                                      (artist_name, artist_website))
                artist_id = cursor.lastrowid
        cursor = g.db.execute(
            'INSERT into art(title, image_url, artist_id, source) VALUES(?,?,?,?)',
            (title, image, artist_id, source))
        inserted_row_id = cursor.lastrowid
        g.db.commit()
        g.db.close()
        if helpers.request_wants_json():
            return jsonify(status='success', message='Art added', id=inserted_row_id)
        else:
            flash('Art added', 'success')
    else:
        if helpers.request_wants_json():
            return jsonify(status='error', message=f'{prefix_normal} Image url was empty')
        else:
            flash(f'{prefix_normal} Image url was empty', 'error')
            return redirect('/add')
    if not helpers.request_wants_json():
        return redirect('/art/' + str(inserted_row_id))
def edit_art():
    id = request.form.get('id')
    title = request.form.get('title')
    images_from_files = request.files.getlist('image-from-file')
    images_from_urls = request.form.getlist('image-from-url')
    existing_images = request.form.getlist('existing-image')
    images = []
    for local_image in request.form.getlist('local-image'):
        if local_image == 'true':
            file = images_from_files[0]
            images_from_files.pop(0)
            if file.filename == '':
                flash('No file selected', 'error')
                return redirect('/')
            if file and helpers.allowed_file(file.filename, ALLOWED_EXTENSIONS):
                filename = secure_filename(file.filename)
                filename = helpers.prepend_date_time_to_string(filename)
                file.save(os.path.join(app.config['UPLOAD_FOLDER'], filename))
                image = filename
                images.append(image)
            else:
                flash('File extension not allowed', 'error')
                return redirect('/')
        elif local_image == 'false':
            image = images_from_urls[0]
            images_from_urls.pop(0)
            if image != '':
                image = helpers.download(image, UPLOAD_FOLDER)
                images.append(image)
            else:
                flash('Image url was empty', 'error')
                return redirect('/')
        elif local_image == 'existing':
            image = existing_images[0]
            existing_images.pop(0)
            images.append(image)
    images_string = ','.join(images)
    g.db = connect_db()
    if request.form.get('existing-artist') == 'true':
        artist_id = request.form.get('artist-id')
    else:
        artist_name = request.form.get('artist-name')
        artist_website = request.form.get('artist-website')
        cursor = g.db.execute('INSERT into artist(name, website) VALUES(?,?)',
                              (artist_name, artist_website))
        artist_id = cursor.lastrowid
    source = request.form.get('source')
    images_before_edit = g.db.execute('SELECT image_url FROM art WHERE id=?',
                                      [id]).fetchone()[0]
    images_before_edit = images_before_edit.split(',')
    images_that_are_no_longer_in_use = list(set(images_before_edit) - set(images))
    for image_that_is_no_longer_in_use in images_that_are_no_longer_in_use:
        try:
            os.remove(os.path.join(UPLOAD_FOLDER, image_that_is_no_longer_in_use))
        except Exception as e:
            # print(e)
            pass
    g.db.execute(
        'UPDATE art SET title=?, image_url=?, artist_id=?, source=?, '
        'updated_at=CURRENT_TIMESTAMP WHERE id=?',
        (title, images_string, artist_id, source, id))
    tags = request.form.getlist('tags')
    if tags != []:
        g.db.execute('DELETE FROM art_tag WHERE art_id=?', [id])
        for tag in tags:
            g.db.execute('INSERT into art_tag(art_id, tag_id) VALUES(?, ?)', (id, tag))
    g.db.commit()
    g.db.close()
    flash('Art updated', 'success')
    return redirect('/art/' + id)
def getData():
    r = download('https://adventofcode.com/2018/day/18/input')
    return np.array([list(x.decode()) for x in r.iter_lines()])
def getData():
    r = download('https://adventofcode.com/2018/day/21/input')
    iterator = r.iter_lines()
    iprLine = next(iterator).decode().split(' ')
    instructions = [[int(y) if i > 0 else y
                     for i, y in enumerate(x.decode().split(' '))]
                    for x in iterator]
    return int(iprLine[1]), instructions
def getData():
    return download('https://adventofcode.com/2018/day/15/input')
def create_alexnet():
    helpers.mkdir('data')
    numpy_data_path = os.path.join('data', 'bvlc_alexnet.npy')
    download_url = 'https://www.dropbox.com/s/gl5wa3uzru555nd/bvlc_alexnet.npy?dl=1'
    print('Downloading pre-trained AlexNet weights.')
    helpers.download(download_url, numpy_data_path)
    print('Weights downloaded.')

    variable_data = np.load(numpy_data_path, encoding='bytes').item()
    conv1_preW = variable_data["conv1"][0]
    conv1_preb = variable_data["conv1"][1]
    conv2_preW = variable_data["conv2"][0]
    conv2_preb = variable_data["conv2"][1]
    conv3_preW = variable_data["conv3"][0]
    conv3_preb = variable_data["conv3"][1]
    conv4_preW = variable_data["conv4"][0]
    conv4_preb = variable_data["conv4"][1]
    conv5_preW = variable_data["conv5"][0]
    conv5_preb = variable_data["conv5"][1]
    fc6_preW = variable_data["fc6"][0]
    fc6_preb = variable_data["fc6"][1]
    fc7_preW = variable_data["fc7"][0]
    fc7_preb = variable_data["fc7"][1]
    fc8_preW = variable_data["fc8"][0]
    fc8_preb = variable_data["fc8"][1]

    pixel_depth = 255.0
    resized_height = 227
    resized_width = 227
    num_channels = 3

    print('Creating AlexNet model.')
    graph = tf.Graph()
    with graph.as_default():
        x = tf.placeholder(tf.uint8, [None, None, None, num_channels], name='input')
        to_float = tf.cast(x, tf.float32)
        resized = tf.image.resize_images(to_float, [resized_height, resized_width])

        # Convolution 1
        with tf.name_scope('conv1') as scope:
            kernel = tf.Variable(conv1_preW, name='weights')
            biases = tf.Variable(conv1_preb, name='biases')
            conv = tf.nn.conv2d(resized, kernel, [1, 4, 4, 1], padding="SAME")
            bias = tf.nn.bias_add(conv, biases)
            conv1 = tf.nn.relu(bias, name=scope)

        # Local response normalization 1
        radius = 2
        alpha = 2e-05
        beta = 0.75
        bias = 1.0
        lrn1 = tf.nn.local_response_normalization(conv1, depth_radius=radius,
                                                  alpha=alpha, beta=beta, bias=bias)

        # Maxpool 1
        pool1 = tf.nn.max_pool(lrn1, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1],
                               padding='VALID', name='pool1')

        # Convolution 2
        with tf.name_scope('conv2') as scope:
            kernel = tf.Variable(conv2_preW, name='weights')
            biases = tf.Variable(conv2_preb, name='biases')
            input_a, input_b = tf.split(pool1, 2, 3)
            kernel_a, kernel_b = tf.split(kernel, 2, 3)
            with tf.name_scope('A'):
                conv_a = tf.nn.conv2d(input_a, kernel_a, [1, 1, 1, 1], padding="SAME")
            with tf.name_scope('B'):
                conv_b = tf.nn.conv2d(input_b, kernel_b, [1, 1, 1, 1], padding="SAME")
            conv = tf.concat([conv_a, conv_b], 3)
            bias = tf.nn.bias_add(conv, biases)
            conv2 = tf.nn.relu(bias, name=scope)

        # Local response normalization 2
        radius = 2
        alpha = 2e-05
        beta = 0.75
        bias = 1.0
        lrn2 = tf.nn.local_response_normalization(conv2, depth_radius=radius,
                                                  alpha=alpha, beta=beta, bias=bias)

        # Maxpool 2
        pool2 = tf.nn.max_pool(lrn2, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1],
                               padding='VALID', name='pool2')

        # Convolution 3
        with tf.name_scope('conv3') as scope:
            kernel = tf.Variable(conv3_preW, name='weights')
            biases = tf.Variable(conv3_preb, name='biases')
            conv = tf.nn.conv2d(pool2, kernel, [1, 1, 1, 1], padding="SAME")
            bias = tf.nn.bias_add(conv, biases)
            conv3 = tf.nn.relu(bias, name=scope)

        # Convolution 4
        with tf.name_scope('conv4') as scope:
            kernel = tf.Variable(conv4_preW, name='weights')
            biases = tf.Variable(conv4_preb, name='biases')
            input_a, input_b = tf.split(conv3, 2, 3)
            kernel_a, kernel_b = tf.split(kernel, 2, 3)
            with tf.name_scope('A'):
                conv_a = tf.nn.conv2d(input_a, kernel_a, [1, 1, 1, 1], padding="SAME")
            with tf.name_scope('B'):
                conv_b = tf.nn.conv2d(input_b, kernel_b, [1, 1, 1, 1], padding="SAME")
            conv = tf.concat([conv_a, conv_b], 3)
            bias = tf.nn.bias_add(conv, biases)
            conv4 = tf.nn.relu(bias, name=scope)

        # Convolution 5
        with tf.name_scope('conv5') as scope:
            kernel = tf.Variable(conv5_preW, name='weights')
            biases = tf.Variable(conv5_preb, name='biases')
            input_a, input_b = tf.split(conv4, 2, 3)
            kernel_a, kernel_b = tf.split(kernel, 2, 3)
            with tf.name_scope('A'):
                conv_a = tf.nn.conv2d(input_a, kernel_a, [1, 1, 1, 1], padding="SAME")
            with tf.name_scope('B'):
                conv_b = tf.nn.conv2d(input_b, kernel_b, [1, 1, 1, 1], padding="SAME")
            conv = tf.concat([conv_a, conv_b], 3)
            bias = tf.nn.bias_add(conv, biases)
            conv5 = tf.nn.relu(bias, name=scope)

        # Maxpool 5
        pool5 = tf.nn.max_pool(conv5, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1],
                               padding='VALID', name='pool5')

        # Fully connected 6
        with tf.name_scope('fc6'):
            weights = tf.Variable(fc6_preW, name='fc6_weights')
            bias = tf.Variable(fc6_preb, name='fc6_bias')
            shape = tf.shape(pool5)
            size = shape[1] * shape[2] * shape[3]
            z = tf.matmul(tf.reshape(pool5, [-1, size]), weights) + bias
            fc6 = tf.nn.relu(z, name='relu')

        # Fully connected 7
        with tf.name_scope('fc7'):
            weights = tf.Variable(fc7_preW, name='weights')
            bias = tf.Variable(fc7_preb, name='bias')
            z = tf.matmul(fc6, weights) + bias
            fc7 = tf.nn.relu(z, name='relu')

        # Fully connected 8
        with tf.name_scope('fc8'):
            weights = tf.Variable(fc8_preW, name='weights')
            bias = tf.Variable(fc8_preb, name='bias')
            fc8 = tf.matmul(fc7, weights) + bias

        softmax = tf.nn.softmax(fc8)
        init = tf.global_variables_initializer()

    print('Model created.')

    sess = tf.Session(graph=graph)
    sess.run(init)

    print('Exporting TensorBoard graph to tbout/alexnet')
    writer = tf.summary.FileWriter('tbout/alexnet', graph=graph)
    writer.close()

    print('Exporting TensorFlow model to data/alexnet')
    with graph.as_default():
        saver = tf.train.Saver()
    save_path = saver.save(sess, 'data/alexnet')
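# A hedged sketch of how the checkpoint exported by create_alexnet() could be
# restored for inference with the TF1 API. The 'Softmax:0' tensor name is an
# assumption based on TensorFlow's default op naming; only the 'input'
# placeholder name is set explicitly above.
import tensorflow as tf

def run_alexnet(images):
    # images: uint8 array of shape (batch, height, width, 3)
    graph = tf.Graph()
    with graph.as_default():
        saver = tf.train.import_meta_graph('data/alexnet.meta')
    with tf.Session(graph=graph) as sess:
        saver.restore(sess, 'data/alexnet')
        x = graph.get_tensor_by_name('input:0')
        softmax = graph.get_tensor_by_name('Softmax:0')  # assumed default name
        return sess.run(softmax, feed_dict={x: images})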
########################################################################################
### Subreddit loop and reply function
### Scan through subreddits and classify food images
########################################################################################
for submission in subreddit.hot(limit=5):
    # Check that the bot hasn't already replied to this post
    if submission.id not in reply_log.Post_ID.values:
        # Ignore stickied posts, e.g. rules
        if submission.stickied:
            continue
        # Download and save picture and thumbnail
        if submission_is_image:
            filename = submission.id + ".jpg"
            download(submission.url, filename, pic_dest)
            download(submission.thumbnail, filename, thumbnail_dest)
            # Call CNN to classify and estimate calories
            # classification, calories = food_CNN(pic_dest, filename)
            classification, calories = (None, None)
            # Reply to post and generate new log entry
            text = "Hi"  # TODO: learn to format
            new_reply = send_reply(text, submission, classification, calories)
            # Save reply in dataframe and csv file
            save_reply(new_reply, reply_log, csv_file)
    else:
        print("Already replied to post")
def parse_detail(self, response):
    category = ''
    name = ''
    address = ''
    city = ''
    phone = ''
    fax = ''
    email = ''
    website = ''
    description = ''
    url = response.url or ''
    image_url = ''

    # check type
    lis = response.css('.comp-body li')
    trs = response.css('table.table.description tr')
    if len(lis) > 0:
        # type 1
        for li in lis:
            k = li.css('::text').get().strip().split(':')[0].strip()
            v = li.css('::text').get().strip().split(':')[-1].strip()
            if len(k) == 0:
                continue
            if 'Company Name' in k:
                name = v
            elif 'Address' in k:
                address = v
            elif 'Telephone' in k:
                phone = li.css('a::text').get()
            elif 'Fax' in k:
                fax = v
            elif 'Email' in k:
                email = li.css('a::text').get()
        # description
        description = []
        for p in response.css('.comp-row > p::text'):
            txt = p.get().strip()
            if len(txt) == 0 or 'Description' in txt:
                continue
            description.append(txt)
        description = ' '.join(description)
        # website
        website = response.css('.comp-row > p > a::attr(href)').get() or ''
        if self.name in website:
            website = ''
        # category
        category = response.css('.title-comp .col-sm-10::text')[-1].get()
        # image_url
        image_url = response.css('.img-container img::attr(src)').get() or ''
    elif len(trs) > 0:
        # type 2
        for tr in trs:
            k = tr.css('td::text')[0].get()
            v = tr.css('td::text')[-1].get()
            if len(k) == 0:
                continue
            if 'Nama Perusahaan' in k:
                name = v
            elif 'Alamat' in k:
                address = tr.css('td')[-1].css('p::text').get()
            elif 'Kategori' in k:
                category = v
            elif 'Telepon' in k:
                phone = tr.css('td')[-1].css('a::text').get()
            elif 'Fax' in k:
                fax = tr.css('td')[-1].css('a::text').get()
            elif 'Email' in k:
                email = tr.css('td')[-1].css('a::text').get()
        # description
        description = []
        for p in response.css('.container > p::text'):
            txt = p.get().strip()
            if len(txt) == 0:
                continue
            description.append(txt)
        description = ' '.join(description)
        # website
        website = response.css('a.btn.btn-contactus.btn-go-to::attr(href)').get() or ''
        if self.name in website:
            website = ''
        # image_url
        image_url = response.css('img.center-img::attr(src)').get() or ''

    if email is None or len(email) == 0:
        self.logger.info('{} : EMPTY EMAIL'.format(url))
        email = ''
    if phone is None or len(phone) == 0:
        self.logger.info('{} : EMPTY PHONE'.format(url))
        phone = ''

    # if len(email) > 0 and len(phone) > 0:
    name = helpers.fix_title(name)
    slug = helpers.get_slug(name)
    if image_url is not None and len(image_url) > 0:
        image_url = image_url.strip()
        ext = image_url.split('.')[-1]
        image_name = slug
        target_dir = 'images/{}/{}.{}'.format(self.name, image_name, ext)
        self.logger.info('downloading image: {} => {}'.format(image_url, target_dir))
        r = helpers.download(image_url, target_dir)
        if not r:
            self.logger.info('Failed download {} => {}'.format(image_url, target_dir))
    yield {
        'category': category.strip(),
        'name': name.strip(),
        'slug': slug.strip(),
        'address': address.strip(),
        'city': city.strip(),
        'phone': phone.strip(),
        'email': email.strip(),
        'website': website.strip(),
        'description': description.strip(),
        'url': url.strip(),
    }
def getData():
    r = download('https://adventofcode.com/2018/day/25/input')
    return np.array([[int(y) for y in x.decode().split(',')] for x in r.iter_lines()])
def __init__(self, cache_dir):
    path = download(type(self).URL, cache_dir)
    lines = self._read(path)
    data, target = self._parse(lines)
    self.data, self.target = self._pad(data, target)
def getData():
    r = download('https://adventofcode.com/2018/day/14/input')
    return int(r.text.strip())
def getData():
    r = download('https://adventofcode.com/2018/day/1/input')
    return list(map(lambda x: int(x), r.iter_lines()))
def getData():
    r = download('https://adventofcode.com/2018/day/5/input')
    # Deliberately return the byte encoding, skipping the trailing newline
    return np.array([x[0] for x in r.iter_content() if x != b'\n'])
def link_crawler(start_url, link_regex, robots_url=None, user_agent='statista',
                 max_depth=-1, delay=3, proxies=None, num_retries=2, cache=None,
                 scraper_callback=None):
    #: Initialize a crawl queue with a seed url to start the crawl from
    crawl_queue = [start_url]
    #: keep track of seen urls
    seen = {}
    robots = {}
    throttle = Throttle(delay)
    #: start the crawl
    while crawl_queue:
        url = crawl_queue.pop()
        #: robots.txt
        robots_file_present = False
        if 'http' not in url:
            continue
        #: Get the domain
        domain = '{}://{}'.format(urlparse(url).scheme, urlparse(url).netloc)
        #: Get the robots parser for this domain from the robots dictionary
        robot_parser = robots.get(domain)
        #: set a default robots url and a parser for it if there isn't one
        if not robot_parser and domain not in robots:
            robots_url = '{}/robots.txt'.format(domain)
            robot_parser = get_robots_parser(robots_url)
            if not robot_parser:
                #: continue to crawl even if there are problems finding the
                #: robots.txt file
                robots_file_present = True
            # associate each domain with a corresponding parser, whether
            # present or not
            robots[domain] = robot_parser
        elif domain in robots:
            robots_file_present = True
        #: crawl only when the url passes robots.txt restrictions
        if robots_file_present or robot_parser.can_fetch(user_agent, url):
            depth = seen.get(url, 0)
            if depth == max_depth:
                #: Skip the link if it has already been crawled to the max depth
                print('Skipping %s due to depth' % url)
                continue
            throttle.wait(url)
            html = download(url, num_retries=num_retries)
            if not html:
                continue
            if scraper_callback:
                scraper_callback(url, html)
            #: Get all links from the page and keep only those matching the given pattern
            for link in get_links(html):
                if re.search(link_regex, link):
                    if 'http' not in link:
                        # check if the link is well formed and correct it
                        if link.startswith('//'):
                            link = '{}:{}'.format(urlparse(url).scheme, link)
                        elif link.startswith('://'):
                            link = '{}{}'.format(urlparse(url).scheme, link)
                        else:
                            link = urljoin(domain, link)
                    if link not in seen:
                        seen[link] = depth + 1
                        crawl_queue.append(link)
        else:
            print('Blocked by robots.txt:', url)
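# link_crawler() relies on a Throttle class that is not shown in this section.
# A minimal sketch, assuming Throttle(delay).wait(url) simply enforces a
# per-domain minimum delay between successive requests:
import time
from urllib.parse import urlparse

class Throttle:
    """Pause between successive requests to the same domain."""

    def __init__(self, delay):
        self.delay = delay
        self.domains = {}  # domain -> timestamp of the last request

    def wait(self, url):
        domain = urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (time.time() - last_accessed)
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = time.time()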
def getData():
    r = download('https://adventofcode.com/2018/day/2/input')
    return list(r.iter_lines())
def parse_detail(self, response):
    category = response.css('ol.breadcrumb.pull-left > li > a')[-1].css('::text').get() or ''
    name = response.css('h1.business-title span::text').get() or ''
    address = []
    city = response.css('span[itemprop=addressLocality]::text').get() or ''
    phone = response.css('span[itemprop=telephone]::text').get() or ''
    email = ''
    website = response.css('ul.dropdown-menu > li > a[itemprop=url]::attr(href)').get() or ''
    description = []
    url = response.url or ''

    # email
    try:
        cfemail = response.css('span.__cf_email__::attr(data-cfemail)').get() or ''
        if len(cfemail) > 0:
            email = helpers.cfDecodeEmail(cfemail)
    except:
        email = ''

    # address
    address_1 = response.css('h4 > span > span::text')
    address_2 = response.css('h4 > span::text')
    for index, a1 in enumerate(address_1):
        a1 = a1.get().strip()
        a2 = address_2[index].get().strip()
        address.append(a1)
        address.append(a2)
    address = ' '.join(address)
    address = address.replace(' ,', ',')

    # description
    for txt in response.css('.col-sm-12 > p p'):
        d = txt.css('::text').get() or ''
        description.append(d.strip())
    description = '. '.join(description)
    description = description.replace('..', '.')
    description = description.replace('. . ', '. ')
    description = description.replace('. . ', '. ')

    if len(email) == 0:
        self.logger.info('{} : EMPTY EMAIL'.format(url))
    if len(phone) == 0:
        self.logger.info('{} : EMPTY PHONE'.format(url))

    if len(email) > 0 and len(phone) > 0:
        image_url = response.css('.detail-listing-img > img::attr(src)').get()
        if image_url is not None and image_url[-1] != '/':
            image_url = image_url.strip()
            ext = image_url.split('.')[-1]
            image_name = helpers.get_slug(helpers.fix_title(name))
            target_dir = 'images/{}/{}'.format(self.name, image_name)
            self.logger.info('downloading image: {} => {}'.format(image_url, target_dir))
            helpers.download(image_url, target_dir)
        yield {
            'category': category.strip(),
            'name': name.strip(),
            'address': address.strip(),
            'city': city.strip(),
            'phone': phone.strip(),
            'email': email.strip(),
            'website': website.strip(),
            'description': description.strip(),
            'url': url.strip(),
        }
def getData():
    r = download('https://adventofcode.com/2018/day/4/input')

    # Make an array for the year 1518 (which is not a leap year)
    initialData = [None] * 12
    for i, v in enumerate([31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]):
        initialData[i] = [None] * v

    regex = re.compile(r'^\[(\d+)-(\d+)-(\d+) (\d+):(\d+)\] '
                       r'(?:(w)akes up|(f)alls asleep|Guard #(\d+) begins shift)$')
    guardSeen = []
    guardMap = []
    guardMaxDays = 0
    for line in [x.decode() for x in r.iter_lines()]:
        linematch = regex.match(line)
        # Two of the last 3 groups (w, f, g) will be None since they weren't matched
        _, m, d, H, M, w, f, g = [(x if not x or x in ['w', 'f'] else int(x))
                                  for x in linematch.groups()]
        # Handle a special case where guards start just before midnight
        if H == 23:
            # Roll over a month if we're on the last day, else add a day
            if d > len(initialData[m]):
                m += 1
                d = 1
            else:
                d += 1
        # Days and months are 1-indexed, so decrement now to get the index
        m -= 1
        d -= 1
        # Initialise this entry in the array if it hasn't been seen yet
        if not initialData[m][d]:
            initialData[m][d] = {'g': 0, 'w': [], 'f': []}
        if w or f:
            # This line is a wake or fall-asleep action
            action = w if w else f
            # Store the minutes at which this action happened on this day, in reverse order
            initialData[m][d][action] = sorted(initialData[m][d][action][:] + [M],
                                               reverse=True)
        else:
            # Then this is a guard shift start.
            # If we haven't seen this guard before, add them to the guardMap.
            # Maintaining a map like this rather than indexing on the raw id
            # compacts the dataset a lot. For the input I had, this reduces it
            # from (3468, 19, 60) to only (22, 19, 60).
            if g not in guardMap:
                guardMapIndex = len(guardMap)
                guardMap.append(g)
            else:
                guardMapIndex = guardMap.index(g)
            # Set the guardMap index for this date
            initialData[m][d]['g'] = guardMapIndex
            # Keep track of the number of times we've seen each guard
            if guardMapIndex < len(guardSeen):
                guardSeen[guardMapIndex] += 1
            else:
                guardSeen.append(1)
            # Keep track of the most days we've seen any particular guard
            guardMaxDays = max(guardMaxDays, guardSeen[guardMapIndex])

    # Build the template for the numpy array, in shape (guard count, guardMaxDays, 60)
    template = [None] * len(guardMap)
    for i in range(len(guardMap)):
        template[i] = [None] * guardMaxDays
        for j in range(guardMaxDays):
            template[i][j] = [False] * 60
    data = np.array(template)

    # Loop over the initial data again to convert it into something a little more useful
    for month in initialData:
        for day in month:
            # We don't have data for some days
            if day:
                g = day['g']
                # We use the seen counter to track where we insert this day's data.
                # The decrement comes first because of 0-indexing.
                guardSeen[g] -= 1
                while len(day['f']) > 0:
                    # Pop the earliest pair of falling/waking off the end of the lists.
                    # This works because we sorted in reverse order before.
                    f = day['f'].pop()
                    w = day['w'].pop()
                    # Set each minute in the interval to True for this
                    # (guard id, seen count) pair
                    data[g][guardSeen[g]][f:w] = [True for _ in range(f, w)]

    return data, guardMap
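# A hedged sketch of how the (n_guards, guardMaxDays, 60) boolean array from
# getData() above could be reduced for part 1 of the puzzle (the sleepiest
# guard multiplied by their most-slept minute). This driver is illustrative
# and not part of the original solution.
import numpy as np

def part1(data, guardMap):
    minutes_asleep = data.sum(axis=(1, 2))      # total minutes asleep per guard
    sleepiest = int(np.argmax(minutes_asleep))  # index of the sleepiest guard
    per_minute = data[sleepiest].sum(axis=0)    # how often that guard sleeps at each minute
    best_minute = int(np.argmax(per_minute))
    return guardMap[sleepiest] * best_minute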