def test_link_1(self):
    """Links over 10 test elements: max pair distance is 2, and only
    elements within that distance end up linked."""
    # BUG FIX: map() returns a lazy iterator in Python 3 and cannot be
    # indexed (elements[0] below would raise TypeError) — materialize it.
    elements = list(map(TestElement, range(10)))
    links = Links(elements, distance)
    distances = links.values()
    # assertEqual gives a useful failure message, unlike assertTrue(a == b)
    self.assertEqual(max(distances), 2)
    self.assertEqual(elements[0].linked, [elements[1], elements[2]])
    self.assertEqual(links.info(elements[2], elements[4]), 2)
    # BUG FIX: assertIsNone(value, msg) — the trailing None was being
    # passed as the failure *message*, not compared against anything.
    self.assertIsNone(links.info(elements[2], elements[5]))
def test_link_1(self):
    """Duplicate of the link test: builds Links from 10 test elements
    and checks the expected neighbour relationships."""
    # BUG FIX: in Python 3 map() yields an iterator; indexing it below
    # (elements[0], elements[1], ...) fails unless it is listed first.
    elements = list(map(TestElement, range(10)))
    links = Links(elements, distance)
    distances = links.values()
    self.assertEqual(max(distances), 2)
    self.assertEqual(elements[0].linked, [elements[1], elements[2]])
    self.assertEqual(links.info(elements[2], elements[4]), 2)
    # BUG FIX: the extra None argument was a no-op failure message;
    # assertIsNone only needs the value under test.
    self.assertIsNone(links.info(elements[2], elements[5]))
def visited_domains():
    """Return, as JSON, the domains visited inside the requested range."""
    try:
        start, end = get_visited_domains_params(request.args, ("from", "to"))
    except Exception as bad_params:
        # unusable "from"/"to" query parameters — the client's fault
        return make_json_response(error=bad_params, code=HTTPStatus.BAD_REQUEST)
    try:
        rows = redis_connector.get_by_range(start, end)
    except Exception:
        # storage failure — report a generic internal error
        return make_json_response(status=ERROR_INTERNAL,
                                  code=HTTPStatus.INTERNAL_SERVER_ERROR)
    return make_json_response(key=DOMAINS_KEY, data=Links(rows).get_domains())
def get_all_asset_links(zettel_paths):
    """Gather the asset links of every given zettel into one Links object."""
    collected = Links()
    for path in zettel_paths:
        # read the zettel's full text
        with open(path, 'r', encoding='utf8') as zettel_file:
            text = zettel_file.read()
        # fold this zettel's asset links into the running collection
        collected.add(get_asset_links(text, path))
    return collected
def reconstruct(self, simptcs, detector):
    """Run particle-flow reconstruction over the simulated particles."""
    self.pfinput = PFInput(simptcs)
    # merge overlapping clusters calorimeter by calorimeter (hcal, then ecal)
    merged = merge_clusters(self.pfinput.element_list(), 'hcal_in')
    merged = merge_clusters(merged, 'ecal_in')
    self.links = Links(merged, distance)
    self.pfreco = PFReconstructor(self.links, detector, self.logger)
def crawl(self, homepage):
    """Recursively crawl pages starting from *homepage*.

    Follows <a href> links, rewriting relative links against the current
    URL, and stops after 100 processed links or when a link leaves the
    start site.
    """
    url = homepage
    rep = r.get(url)
    if rep.status_code == 200:
        soup = bs(rep.text, 'html.parser')
        links = soup.find_all('a')
        for l in links:
            link = (l.get('href'))
            if link:
                # NOTE(review): only "http://" is recognised as absolute,
                # so "https://..." links get mangled by the rebuild below
                # — confirm whether the target site is http-only.
                if not link.startswith("http://"):
                    # rebuild a relative link as scheme + '//' + host + '/' + link
                    # (split('/', 3) on "http://host/path" gives
                    #  ['http:', '', 'host', 'path'])
                    link = url.split('/', 3)[0] + "//" + url.split(
                        '/', 3)[1] + url.split('/', 3)[2] + "/" + link
                if Links.process_links(link):
                    self.count += 1
                    # hard cap on the number of links followed
                    if self.count > 100:
                        break
                    # NOTE(review): url[7:-9] drops "http://" and the last
                    # 9 characters — presumably to keep the crawl on-site;
                    # verify the slice against the actual start URL.
                    if url[7:-9] not in link:
                        break
                    # depth-first: recurse into the link just accepted
                    self.crawl(link)
def get_asset_links(contents, zettel_path):
    """Find every asset link in a zettel's text.

    Parameters:
        contents -- the zettel's full text.
        zettel_path -- absolute path of the zettel, or '' for one not on disk.

    Returns a Links object holding each matched link, its display name,
    and the zettel it came from.
    """
    assert os.path.isabs(zettel_path) or zettel_path == ''
    links = Links()
    for link_match in re.finditer(asset_link_pattern, contents):
        link_dict = link_match.groupdict()
        # URLs that end with '.html' or '.htm' could be in the list.
        # Ignore them, but not locally saved files with those endings.
        # (endswith accepts a tuple of suffixes — one call instead of two.)
        if link_dict['link'].endswith(('.html', '.htm')):
            if html_link_is_URL(link_dict['link']):
                continue
        links.append(link_dict['link'], link_dict['name'], zettel_path)
    return links
def spi_response(self, response, *args, **kwargs):
    '''Handle a response from spi_request.

    Only unseen HTML pages are processed: the body is hashed and, when
    the hash is new, the page's outgoing links are added to URLset.
    '''
    if 'text/html' in response.headers['Content-Type']:
        hash_val = Hasher.HashMD5(response.content)
        if hash_val not in self.URLhash:
            self.URLhash.add(hash_val)
            # BUG FIX: set.union() returns a NEW set and leaves the
            # receiver untouched, so the parsed links were silently
            # discarded; update() mutates URLset in place.
            self.URLset.update(Links.parse_link(response))
def spi_response(self, response):
    '''Handle a response from spi_request.

    A 200 HTML page whose hash is not yet in redis is persisted to the
    database (when connected), remembered in redis, and its outgoing
    links are queued for crawling.
    '''
    if 'text/html' in response.headers[
            'Content-Type'] and response.status_code == 200:
        hash_val = Hasher.HashMD5(response.content)
        if self.redis.getVariable(hash_val) is None:
            if self.database.isConn():
                self.database.saveData(hash=hash_val,
                                       url=response.url,
                                       content=response)
            self.redis.setVariable(hash_val, response.url)
            # A plain loop, not a list comprehension: only the queue
            # side effect is wanted, not a throwaway list of None.
            for link in Links.parse_link(response):
                self.URLset.put(link)
def download(self, keyword):
    """Download every image found for *keyword* into path/keyword/.

    Links are collected via the Links source, each image is saved under a
    zero-padded index, validated, and its extension corrected when the
    file contents disagree with the URL's extension. Failures on a single
    link are logged and skipped (best-effort batch download).
    """
    source = Links()
    print('Collecting downloadable links of {}...'.format(keyword))
    links = source.collect(keyword)
    print(
        'Downloading images of {} from collected links...'.format(keyword))
    self.mkdir('{}/{}'.format(self.path, keyword))
    n_links = len(links)
    for index, link in enumerate(links):
        try:
            print(
                'Downloading this image based on the keyword {} from {}: {}/{}'
                .format(keyword, link, index + 1, n_links))
            # stream=True: body is fetched lazily; self.save consumes it
            response = requests.get(link, stream=True)
            ext = self.get_extension(link)
            # e.g. "<path>/<keyword>/0007" — extension appended below
            raw_path = '{}/{}/{}'.format(self.path, keyword,
                                         str(index).zfill(4))
            path = raw_path + '.' + ext
            self.save(response, path)
            # release the response object before validating the file
            del response
            print("Validating image file")
            ext2 = self.validate(path)
            if ext2 is None:
                # not a readable image — drop the broken file
                print('Unreadable file - {}'.format(link))
                os.remove(path)
            else:
                if ext != ext2:
                    # the real format differs from the URL's extension
                    path2 = raw_path + '.' + ext2
                    os.rename(path, path2)
                    print('Renaming extension {} -> {}'.format(ext, ext2))
        except Exception as e:
            # deliberate best-effort: one bad link must not stop the batch
            print('Download failed.', e)
            continue
def visited_links():
    """Record a batch of visited links posted as JSON."""
    # Parsing the body and building the Links object are both client
    # errors and shared the same 400 response, so one handler covers both.
    try:
        payload = request.json
        links = Links.from_json(payload)
    except Exception as bad_input:
        return make_json_response(error=bad_input, code=HTTPStatus.BAD_REQUEST)
    try:
        redis_connector.add(links.links, now_timestamp())
    except Exception:
        return make_json_response(error=ERROR_INTERNAL,
                                  code=HTTPStatus.INTERNAL_SERVER_ERROR)
    return make_json_response()
def merge_clusters(elements, layer):
    """Merge each group of linked clusters in *layer* into a single
    supercluster; elements from other layers pass through unchanged."""
    in_layer = [el for el in elements if el.layer == layer]
    others = [el for el in elements if el.layer != layer]
    merged = []
    links = Links(in_layer, distance)
    for group in links.groups.values():
        first = group[0]
        if len(group) == 1:
            # nothing to merge — keep the lone cluster as-is
            merged.append(first)
            continue
        # Copy the first cluster, publish it, THEN fold the rest in:
        # the append happens before the += exactly as in the original,
        # so in-place accumulation semantics are preserved.
        supercluster = copy.copy(first)
        merged.append(supercluster)
        for cluster in group[1:]:
            supercluster += cluster
    merged.extend(others)
    return merged
def _enhance_element_tree(self, e):
    """Make an element tree attribute-navigable.

    Every child becomes an attribute of its parent: a list when the tag
    repeats, the text for leaf nodes, the element itself otherwise.
    ``links``/``link`` accessors for the tree's <link> elements are also
    attached. Returns the enhanced tree.
    """
    # BUG FIX: Element.getiterator() was deprecated and removed in
    # Python 3.9; iter() has been the equivalent since 2.7.
    for element in e.iter():
        for child in list(element):
            if len(element.findall(child.tag)) > 1:
                # repeated tag: expose all matches as a list
                setattr(element, child.tag, element.findall(child.tag))
            elif len(list(child)) == 0:
                # leaf node: expose its text directly
                setattr(element, child.tag, child.text)
            else:
                setattr(element, child.tag, element.find(child.tag))
    link_dicts = []
    for element in e.iter('link'):
        link_dicts.append({
            'href': element.attrib.get('href'),
            'rel': element.attrib.get('rel'),
            # fall back to a generic XML content type when unspecified
            'type': element.attrib.get('type') or 'application/xml'
        })
    e.links = lambda: Links(link_dicts)
    e.link = lambda x: e.links().get(x)
    return e
class Queue(object):
    '''
    The download queue view: binds the pyLoad client's package queue to
    a Gtk tree, renders per-package progress/speed/ETA, and persists
    column widths in GSettings.

    NOTE: this module is Python 2 code (dict.has_key, itervalues).
    '''

    def __init__(self, builder, client):
        '''
        Constructor: wire the tree view, its renderers and the client
        property events together.
        '''
        self.client = client
        # load the application settings
        self.settings = Gio.Settings.new("org.pyLoader.queue")
        self.links = Links(builder, client)
        self.tree = builder.get_object("queue_tree")
        # create the item store (packages)
        self.store = Gtk.ListStore(Package.__gtype__)
        self.store.set_sort_func(0, self.__store_compare, None)
        self.tree.set_model(self.store)
        # queue columns
        self.order_column = builder.get_object("queue_order")
        self.name_column = builder.get_object("queue_name")
        self.links_column = builder.get_object("queue_links")
        self.size_column = builder.get_object("queue_size")
        self.downloaded_column = builder.get_object("queue_downloaded")
        self.speed_column = builder.get_object("queue_speed")
        self.eta_column = builder.get_object("queue_eta")
        self.progress_column = builder.get_object("queue_progress")
        # create renderers
        order_renderer = Gtk.CellRendererText()
        name_renderer = Gtk.CellRendererText()
        links_renderer = Gtk.CellRendererText()
        size_renderer = Gtk.CellRendererText()
        downloaded_renderer = Gtk.CellRendererText()
        speed_renderer = Gtk.CellRendererText()
        eta_renderer = Gtk.CellRendererText()
        progress_renderer = Gtk.CellRendererProgress()
        # set column renderers
        self.order_column.pack_start(order_renderer, True)
        self.name_column.pack_start(name_renderer, True)
        self.links_column.pack_start(links_renderer, True)
        self.size_column.pack_start(size_renderer, True)
        self.downloaded_column.pack_start(downloaded_renderer, True)
        self.speed_column.pack_start(speed_renderer, True)
        self.eta_column.pack_start(eta_renderer, True)
        self.progress_column.pack_start(progress_renderer, True)
        self.order_column.set_cell_data_func(order_renderer, self.__render_order)
        self.name_column.set_cell_data_func(name_renderer, self.__render_name)
        self.links_column.set_cell_data_func(links_renderer, self.__render_links)
        self.size_column.set_cell_data_func(size_renderer, self.__render_size)
        self.downloaded_column.set_cell_data_func(downloaded_renderer, self.__render_downloaded)
        self.speed_column.set_cell_data_func(speed_renderer, self.__render_speed)
        self.eta_column.set_cell_data_func(eta_renderer, self.__render_eta)
        self.progress_column.set_cell_data_func(progress_renderer, self.__render_progress)
        # connect to ui events
        # self.tree.connect ("button-press-event", self.__on_button_press)
        selection = self.tree.get_selection()
        selection.connect("changed", self.__on_selection_changed)
        # connect to client property events
        client.queue.added += self.__on_queue_added
        client.queue.changed += self.__on_queue_changed
        client.downloads.changed += self.__on_downloads_changed
        # load the queue column settings
        self.order_column.set_fixed_width(self.settings.get_uint("column-order-size"))
        self.name_column.set_fixed_width(self.settings.get_uint("column-name-size"))
        self.links_column.set_fixed_width(self.settings.get_uint("column-links-size"))
        self.size_column.set_fixed_width(self.settings.get_uint("column-size-size"))
        self.downloaded_column.set_fixed_width(self.settings.get_uint("column-downloaded-size"))
        self.speed_column.set_fixed_width(self.settings.get_uint("column-speed-size"))
        self.eta_column.set_fixed_width(self.settings.get_uint("column-eta-size"))
        self.progress_column.set_fixed_width(self.settings.get_uint("column-progress-size"))

    def save_state(self):
        '''Persist the current column widths back into GSettings.'''
        self.settings.set_uint("column-order-size", self.order_column.get_width())
        self.settings.set_uint("column-name-size", self.name_column.get_width())
        self.settings.set_uint("column-links-size", self.links_column.get_width())
        self.settings.set_uint("column-size-size", self.size_column.get_width())
        self.settings.set_uint("column-downloaded-size", self.downloaded_column.get_width())
        self.settings.set_uint("column-speed-size", self.speed_column.get_width())
        self.settings.set_uint("column-eta-size", self.eta_column.get_width())
        self.settings.set_uint("column-progress-size", self.progress_column.get_width())

    def __render_order(self, column, cell, model, iter, data):
        '''Render the package's queue position.'''
        # get the item we are dealing with
        item = model[iter][0]
        cell.set_property("text", "{0}".format(item.order))

    def __render_name(self, column, cell, model, iter, data):
        '''Render the package name.'''
        # get the item we are dealing with
        item = model[iter][0]
        cell.set_property("text", item.name)

    def __render_links(self, column, cell, model, iter, data):
        '''Render "done/total completed" for the package's links.'''
        # get the item we are dealing with
        item = model[iter][0]
        # BUG FIX: the format string was "{0}/{0}", repeating links_done
        # and never showing links_total.
        cell.set_property("text", "{0}/{1} completed".format(item.links_done, item.links_total))

    def __render_size(self, column, cell, model, iter, data):
        '''Render the package's total size, human-formatted.'''
        # get the item we are dealing with
        item = model[iter][0]
        total = utils.format_size(item.size_total)
        cell.set_property("text", total)

    def __render_downloaded(self, column, cell, model, iter, data):
        '''Render the downloaded size, or nothing when untouched.'''
        # get the item we are dealing with
        item = model[iter][0]
        if item.size_done > 0:
            total = utils.format_size(item.size_done)
            cell.set_property("text", total)
        else:
            cell.set_property("text", "")

    def __render_speed(self, column, cell, model, iter, data):
        '''Render the summed speed of the package's active downloads.'''
        # get the item we are dealing with
        item = model[iter][0]
        if item.links_downloading:
            speed = 0
            downloads = self.client.downloads.value
            # sum the speed of every link currently downloading
            for link in item.links.itervalues():
                if downloads.has_key(link.id):
                    speed += downloads[link.id].speed
            speed = utils.format_size(speed)
            cell.set_property("text", "{0}/s".format(speed))
        else:
            cell.set_property("text", "")

    def __render_eta(self, column, cell, model, iter, data):
        '''Render the ETA: summed for active links, the shortest wait
        for queued links, empty otherwise.'''
        # get the item we are dealing with
        item = model[iter][0]
        # link is active
        if item.links_downloading:
            eta = 0
            downloads = self.client.downloads.value
            for link in item.links.itervalues():
                if downloads.has_key(link.id):
                    eta += downloads[link.id].eta
            eta = utils.format_time(eta)
            cell.set_property("markup", eta)
        # link is waiting
        elif not item.links_downloading and item.links_waiting:
            eta = None
            downloads = self.client.downloads.value
            # find the smallest remaining wait among this package's links
            for link in item.links.itervalues():
                if downloads.has_key(link.id):
                    time_left = downloads[link.id].time_left
                    if not eta:
                        eta = time_left
                    elif time_left < eta:
                        eta = time_left
            eta = eta if eta > 0 else 0
            eta = utils.format_time(eta)
            cell.set_property("markup", "<small>Waiting - {0}</small>".format(eta))
        # inactive link
        else:
            cell.set_property("markup", "")

    def __render_progress(self, column, cell, model, iter, data):
        '''Render average completion across the package's links.'''
        item = model[iter][0]
        percent = 0
        downloads = self.client.downloads.value
        for link in item.links.itervalues():
            if downloads.has_key(link.id):
                percent += downloads[link.id].percent
            elif link.status == Link.Status.FINISHED:
                # finished links no longer appear in downloads
                percent += 100
        cell.set_property("value", percent / len(item.links))

    def __store_compare(self, model, row1, row2, userdata):
        '''Sort packages by their queue order.'''
        item1 = model[row1][0]
        item2 = model[row2][0]
        if item1.order < item2.order:
            return -1
        elif item1.order == item2.order:
            return 0
        else:
            return 1

    def __on_queue_added(self, prop, package):
        '''
        Handler to show newly added packages from the server
        '''
        parent = self.store.append([package])

    def __on_queue_changed(self, prop, package):
        '''Queue metadata changed: request a redraw.'''
        self.tree.queue_draw()

    def __on_downloads_changed(self, property, value):
        '''Download progress changed: request a redraw.'''
        self.tree.queue_draw()

    def __on_button_press(self, widget, event):
        '''
        Handler to show the popup menu in the queue
        '''
        if event.type == Gdk.EventType.BUTTON_PRESS and event.button == 3:
            # get the current selection to determine which popup to use
            path, column, cell_x, cell_y = self.queue_tree.get_path_at_pos(event.x, event.y)
            iter = self.store.get_iter(path)
            # show the right context
            if iter and self.store[iter][0].is_link:
                link = self.store[iter][0]
                if link.offline:
                    self.link_menu_failed.popup(None, None, None, None, event.button, event.time)
                elif link.active:
                    self.link_menu_active.popup(None, None, None, None, event.button, event.time)
        return False

    def __on_selection_changed(self, selection):
        '''Load the newly selected package into the links panel.'''
        model, iter = selection.get_selected()
        package = model[iter][0]
        self.links.load(package)
def head_for_server(domain, url):
    """HEAD-request domain/url (following redirects) and return the
    final URL and the response headers."""
    target_url = domain + "/" + url
    print(target_url)
    headers = {}
    r = requests.head(target_url,
                      headers=headers,
                      allow_redirects=True,
                      timeout=10)
    return {"url": r.url, "headers": r.headers}


# Crawl loop: poll for unvisited links, probe each one for its Server
# header, and record the result.
while True:
    unvisited_links = Links.get_unvisited_links(conn)
    if len(unvisited_links) == 0:
        print("Nothing to crawl, going to sleep")
        time.sleep(5)
        continue
    for link in unvisited_links:
        print("Going to {}".format(link["url"]))
        try:
            result = head_for_server(link["domain"], link["url"])
            print("Got result for {}. It is {}".format(link["url"],
                                                       result["url"]))
            Servers.insert_server(conn, link["link_id"], result["url"],
                                  result["headers"]["Server"])
        except Exception as e:
            # BUG FIX: the bare `except: pass` swallowed everything,
            # including KeyboardInterrupt/SystemExit, with no trace.
            # Stay best-effort, but log the failure and let exit
            # signals propagate.
            print("Failed to crawl {}: {}".format(link["url"], e))
def __init__(self, builder, client):
    '''
    Constructor: wire the queue tree view, its cell renderers and the
    pyLoad client's property events together, then restore the saved
    column widths from GSettings.
    '''
    self.client = client
    # load the application settings
    self.settings = Gio.Settings.new("org.pyLoader.queue")
    self.links = Links(builder, client)
    self.tree = builder.get_object("queue_tree")
    # create the item store (packages)
    self.store = Gtk.ListStore(Package.__gtype__)
    self.store.set_sort_func(0, self.__store_compare, None)
    self.tree.set_model(self.store)
    # queue columns
    self.order_column = builder.get_object("queue_order")
    self.name_column = builder.get_object("queue_name")
    self.links_column = builder.get_object("queue_links")
    self.size_column = builder.get_object("queue_size")
    self.downloaded_column = builder.get_object("queue_downloaded")
    self.speed_column = builder.get_object("queue_speed")
    self.eta_column = builder.get_object("queue_eta")
    self.progress_column = builder.get_object("queue_progress")
    # create renderers
    order_renderer = Gtk.CellRendererText()
    name_renderer = Gtk.CellRendererText()
    links_renderer = Gtk.CellRendererText()
    size_renderer = Gtk.CellRendererText()
    downloaded_renderer = Gtk.CellRendererText()
    speed_renderer = Gtk.CellRendererText()
    eta_renderer = Gtk.CellRendererText()
    progress_renderer = Gtk.CellRendererProgress()
    # set column renderers
    self.order_column.pack_start(order_renderer, True)
    self.name_column.pack_start(name_renderer, True)
    self.links_column.pack_start(links_renderer, True)
    self.size_column.pack_start(size_renderer, True)
    self.downloaded_column.pack_start(downloaded_renderer, True)
    self.speed_column.pack_start(speed_renderer, True)
    self.eta_column.pack_start(eta_renderer, True)
    self.progress_column.pack_start(progress_renderer, True)
    # bind each renderer to its render callback
    self.order_column.set_cell_data_func(order_renderer, self.__render_order)
    self.name_column.set_cell_data_func(name_renderer, self.__render_name)
    self.links_column.set_cell_data_func(links_renderer, self.__render_links)
    self.size_column.set_cell_data_func(size_renderer, self.__render_size)
    self.downloaded_column.set_cell_data_func(downloaded_renderer, self.__render_downloaded)
    self.speed_column.set_cell_data_func(speed_renderer, self.__render_speed)
    self.eta_column.set_cell_data_func(eta_renderer, self.__render_eta)
    self.progress_column.set_cell_data_func(progress_renderer, self.__render_progress)
    # connect to ui events
    # self.tree.connect ("button-press-event", self.__on_button_press)
    selection = self.tree.get_selection()
    selection.connect("changed", self.__on_selection_changed)
    # connect to client property events
    client.queue.added += self.__on_queue_added
    client.queue.changed += self.__on_queue_changed
    client.downloads.changed += self.__on_downloads_changed
    # load the queue column settings
    self.order_column.set_fixed_width(self.settings.get_uint("column-order-size"))
    self.name_column.set_fixed_width(self.settings.get_uint("column-name-size"))
    self.links_column.set_fixed_width(self.settings.get_uint("column-links-size"))
    self.size_column.set_fixed_width(self.settings.get_uint("column-size-size"))
    self.downloaded_column.set_fixed_width(self.settings.get_uint("column-downloaded-size"))
    self.speed_column.set_fixed_width(self.settings.get_uint("column-speed-size"))
    self.eta_column.set_fixed_width(self.settings.get_uint("column-eta-size"))
    self.progress_column.set_fixed_width(self.settings.get_uint("column-progress-size"))
def main(args):
    """Initialise the global link store, then start the web server."""
    global links
    links = Links()
    # bind on all interfaces; port and debug mode come from the CLI args
    app.run(host='0.0.0.0', port=args.port, debug=args.debug)
def head_for_server(domain, url): target_url = domain + "/" + url print(target_url) headers = {} r = requests.head(target_url, headers=headers, allow_redirects=True, timeout=10) return {"url": r.url, "headers": r.headers} while True: unvisited_links = Links.get_unvisited_links(conn) if len(unvisited_links) == 0: print("Nothing to crawl, going to sleep") time.sleep(5) continue for link in unvisited_links: print("Going to {}".format(link["url"])) try: result = head_for_server(link["domain"], link["url"]) print("Got result for {}. It is {}".format(link["url"], result["url"])) Servers.insert_server(conn, link["link_id"], result["url"], result["headers"]["Server"]) except:
def links(self):
    """Parse this response's Link header and return it as a Links object."""
    return Links(self._link_header_to_array())
def links(self):
    """Build the Links collection from this response's Link header."""
    header_entries = self._link_header_to_array()
    return Links(header_entries)
for link in soup.find_all("a"): href = link.get("href") if href is None: continue if "start.bg" in href and "javascript:" not in href: inbound.add(href) elif "link.php" in href: outbound.add(href) else: others.add(href) return {"inbound": inbound, "outbound": outbound, "others": others} while True: domains_to_visit = Domains.get_all_unvisited_domains(conn) if len(domains_to_visit) == 0: break for domain_row in domains_to_visit: result = collect_domain(domain_row["domain"]) domain_id = domain_row["domain_id"] Domains.visit_domain(conn, domain_id) Domains.insert_domains(conn, result["inbound"]) Links.insert_links(conn, result["outbound"], domain_id)
import praw
from linkextractor import LinkExtractor
from history import History
from logger import Logger
from links import Links

# User-agent string identifying this scraper to the reddit API.
AGENT = 'Web scraper for /r/cscareerquestions. Made using PRAW'

# Wire up the reddit session and the local helpers (history of handled
# items, logging, and link storage).
reddit = praw.Reddit(AGENT)
history = History()
log = Logger()
link_file = Links()

log.write('Getting subreddit...')
cscareerquestions = reddit.get_subreddit('cscareerquestions')

if cscareerquestions:
    log.write('Getting comments...')
    comments = cscareerquestions.get_comments()
    log.write('Getting submissions...')
    submissions = cscareerquestions.get_hot(limit = 10)
    log.write('Loading history...')
    all_history = history.get_history()
    log.write('Looping through recent comments...')
    for comment in comments:
        # NOTE(review): `id` shadows the builtin; also, this loop body
        # looks truncated here — the chunk likely continues elsewhere.
        id = comment.id
        log.write('Handling comment ' + str(id) + '...')
#! /usr/bin/python
from links import Links

if __name__ == '__main__':
    # build the Links UI and hand control to its event loop
    app = Links()
    app.mainloop()
def setup(self):
    """Create page objects and test data, then open the landing page."""
    self.links = Links()
    self.data = Data()
    self.main_page = MainPage(self.driver)
    # navigate to the landing URL defined in the links table
    self.driver.get(self.links.landing)
return { "inbound": inbound, "outbound": outbound, "others": others } while True: domains_to_visit = Domains.get_all_unvisited_domains(conn) if len(domains_to_visit) == 0: break for domain_row in domains_to_visit: result = collect_domain(domain_row["domain"]) domain_id = domain_row["domain_id"] Domains.visit_domain(conn, domain_id) Domains.insert_domains(conn, result["inbound"]) Links.insert_links(conn, result["outbound"], domain_id) print("Sleeping for 20 seconds ZzZzz") time.sleep(20)
def __init__(self, driver, timeout=10):
    """Keep the webdriver, set its implicit wait, and load the link table."""
    self.driver = driver
    # every element lookup will wait up to `timeout` seconds
    driver.implicitly_wait(timeout)
    self.links = Links()