def get_judgment(self, url, relpath, metainfo):
    """Download one judgment page and write its metadata sidecar file.

    url      -- absolute URL of the judgment page
    relpath  -- directory (relative to rawdir/metadir) to store it under
    metainfo -- tag dictionary written via utils.print_tag_file

    Returns the relative path of the saved file, or None when no filename
    could be derived or the download failed.
    NOTE(review): structure reconstructed from a flattened source line —
    the final `return rel` is taken to be inside the existence check, so
    a failed download returns None; confirm against the original file.
    """
    # Derive a filesystem name from the URL's yID/nID/ID query parameters.
    filename = utils.url_to_filename(url, False, ["yID", "nID", "ID"])
    if not filename:
        self.logger.warning(u"No filename for %s" % url)
        return
    rel = os.path.join(relpath, filename)
    filepath = os.path.join(self.rawdir, rel)
    if os.path.exists(filepath):
        # Already downloaded in a previous run; skip the network fetch.
        self.logger.info(u"Already exists %s" % filepath)
    else:
        self.logger.info(u"Downloading %s" % url)
        # Session cookies are required by the court site; loaded from cookiefile.
        webpage = self.download_url(url, loadcookies=self.cookiefile.name)
        if not webpage:
            self.logger.warning(u"Could not download %s" % url)
            return
        utils.save_file(filepath, webpage)
        self.logger.info(u"Saved %s" % filepath)
    if os.path.exists(filepath):
        metapath = os.path.join(self.metadir, rel)
        # Write metadata only when present and either forced (updateMeta)
        # or not yet written.
        if metainfo and (self.updateMeta or not os.path.exists(metapath)):
            utils.print_tag_file(metapath, metainfo)
        return rel
def get_judgment(self, url, relpath, metainfo):
    """Fetch a single judgment, save the raw page, and emit metadata.

    Returns the relative path under rawdir on success; returns None when
    the URL yields no usable filename or the download fails.
    NOTE(review): indentation reconstructed from a flattened line — the
    trailing `return rel` is assumed to sit inside the final existence
    check; verify against the original source.
    """
    # Build a filename from the yID/nID/ID query parameters of the URL.
    filename = utils.url_to_filename(url, False, ['yID', 'nID', 'ID'])
    if not filename:
        self.logger.warning(u'No filename for %s' % url)
        return
    rel = os.path.join(relpath, filename)
    filepath = os.path.join(self.rawdir, rel)
    if os.path.exists(filepath):
        # Cached from an earlier run — no re-download.
        self.logger.info(u'Already exists %s' % filepath)
    else:
        self.logger.info(u'Downloading %s' % url)
        # The site requires the saved session cookies.
        webpage = self.download_url(url, loadcookies=self.cookiefile.name)
        if not webpage:
            self.logger.warning(u'Could not download %s' % url)
            return
        utils.save_file(filepath, webpage)
        self.logger.info(u'Saved %s' % filepath)
    if os.path.exists(filepath):
        metapath = os.path.join(self.metadir, rel)
        # Metadata is (re)written when forced via updateMeta or missing.
        if metainfo and (self.updateMeta or not os.path.exists(metapath)):
            utils.print_tag_file(metapath, metainfo)
        return rel
def main(user_url: str) -> None:
    """Entry point: derive per-URL data/urls filenames, optionally start the
    browser bot, and load any previously saved item URLs.

    NOTE(review): this span may be a truncated prefix of a longer function —
    `last_page_num` and `item_urls` are computed but not used within the
    visible code; presumably consumed further down. Confirm against the
    full file.
    """
    url, params = extract_params(user_url)
    cur_page_num = get_current_page(params)
    last_page_num = get_last_page(user_url, cur_page_num)
    # Strip the page-number parameter so the filename is page-independent.
    params = params.replace(f'p={cur_page_num}', '')
    # new_url is url without the p=123 parameter
    new_url = f'{url}?{params}'
    # Windows-safe filename derived from the canonical URL.
    filename = url_to_filename(new_url, substitute='-', ext='.csv', _os='win')
    DATA_FILENAME = 'data_' + filename
    URLS_FILENAME = 'urls_' + filename
    # Publish the filenames through the project's global registry so other
    # modules can read them.
    new_global('DATA_FILENAME', DATA_FILENAME)
    new_global('URLS_FILENAME', URLS_FILENAME)
    bot = None
    if USE_BOT:
        # start browser (selenium-style wrapper; proxy comes from globals)
        bot = Browser(headless=HEADLESS, proxy=get_global('PROXY'),
                      driverpath=WEBDRIVERPATH)
        new_global('BOT', bot)
    item_urls = []
    # Resume support: reuse item URLs saved by a previous run, if any.
    if urls_data := load_data(URLS_FILENAME):
        item_urls: List[str] = urls_data.split('\n')
def get_complete_recipes(recipes, image_list):
    """Return intersection of recipe keys and image keys.

    Parameters
    ----------
    recipes : dict
        Mapping whose keys are recipe URLs (normalized via url_to_filename).
    image_list : dict
        Mapping whose keys are image filenames.

    Returns
    -------
    numpy.ndarray
        Array of filenames present in both collections.
    """
    # Build the normalized key collection as a set ONCE so each membership
    # test below is O(1); the original list made the filter O(n*m).
    recipe_keys = {url_to_filename(k) for k in recipes}
    files = np.array([filename for filename in image_list.keys()
                      if filename in recipe_keys])
    print('{:,} complete recipes found'.format(len(files)))
    return files
def clean_recipe_keys(recipes):
    """Return a copy of *recipes* whose keys have been normalized with
    url_to_filename (URLs stripped of special characters)."""
    return {url_to_filename(url): recipe for url, recipe in recipes.items()}
def get_complete_recipes(recipes, image_list):
    """Return intersection of recipe keys and image keys.

    Keys of *recipes* are normalized with url_to_filename and intersected
    with the keys of *image_list*; the matches are returned as a NumPy
    array, and a count is printed.
    """
    # Use a set for the normalized recipe keys: membership checks become
    # O(1) instead of the accidental O(n) list scan per image filename.
    recipe_keys = {url_to_filename(k) for k in recipes}
    files = np.array([
        filename for filename in image_list.keys()
        if filename in recipe_keys
    ])
    print('{:,} complete recipes found'.format(len(files)))
    return files
def edit_post(post_name=None): print "Post Name: ", post_name if post_name == None: return "We'll manage all posts from here" else: filename = utils.url_to_filename(post_name) print "Looking for file:", filename if os.path.isfile(filename): p = Post.readFromFile(filename) return render_template("edit_post.html", post=p) else: return "No file with name: " + filename
def save_page(self, page, save_path):
    """Persist *page* to a file under *save_path*.

    The filename is derived from self.url.url; any failure is wrapped and
    re-raised as a generic Exception so the caller sees a single error type.
    """
    try:
        logging.info("Match target_url, saving page of url:%s" % self.url.url)
        target = os.path.join(save_path, url_to_filename(self.url.url))
        with open(target, 'w') as handle:
            handle.write(page)
    except Exception as err:
        raise Exception("Save_page failed.\n%s" % err)
def test_url_to_file(self):
    """url_to_filename should percent-encode the slashes of a plain URL."""
    expected = 'http:%2F%2Fwww.baidu.com'
    actual = utils.url_to_filename('http://www.baidu.com')
    self.assertEqual(actual, expected)
def download_oneday(self, relpath, dateobj):
    """Scrape all Gujarat HC orders for one date and save them locally.

    relpath -- directory (relative to rawdir/metadir) for this date's files
    dateobj -- date to query (formatted as d-m-y via utils.dateobj_to_str)

    Returns the list of relative paths newly downloaded in this call.
    NOTE(review): structure reconstructed from a flattened source line;
    statement nesting (esp. the metadata block inside the tr loop) assumed
    from the control-flow keywords — confirm against the original file.
    """
    newdls = []
    pageurl = urllib.basejoin(self.baseurl, '/gujarathc/')
    datestr = utils.dateobj_to_str(dateobj, '-')
    # Same date for fdate and tdate => a single-day query window.
    dateurl = pageurl + 'orderdatewisedata.jsp?fdate=%s&tdate=%s' % \
              (datestr, datestr)
    webpage = self.download_url (dateurl, referer = self.baseurl, \
                                 loadcookies = self.cookiefile.name)
    if not webpage:
        self.logger.warning(u'No webpage for %s' % dateurl)
        return newdls
    # Neutralize window.open(...) javascript so the HTML parser is not
    # confused; sanitize_windowopen rewrites each matched call.
    webpage = re.sub('(?P<windowopen>window.open\([^)]+\))', \
                     self.sanitize_windowopen, webpage)
    d = utils.parse_webpage(webpage)
    if not d:
        self.logger.error(u'Could not parse html of the result page for date %s' % dateobj)
        return newdls
    trs = d.findAll('tr')
    for tr in trs:
        link = tr.find('a')
        if not link:
            self.logger.info(u'No link in %s' % tr)
            continue
        # The judgment URL is embedded in the anchor's onclick handler,
        # not in href.
        href = link.get('onclick')
        if not href:
            self.logger.info(u'No href in %s' % tr)
            continue
        # Extract the showoj.jsp?... fragment from the onclick javascript.
        reobj = re.search("showoj.jsp?[^'\s]+", href)
        (start, end) = reobj.span()
        pagerelurl = href[start:end]
        url = urllib.basejoin(pageurl, pagerelurl)
        # Filename comes from the caseyr/caseno/casetype query parameters.
        filename = utils.url_to_filename(url, False, ['caseyr', 'caseno', \
                                                      'casetype'])
        if not filename:
            self.logger.error(u'Could not get filename for %s' % url)
            continue
        relurl = os.path.join(relpath, filename)
        filepath = os.path.join(self.rawdir, relurl)
        metapath = os.path.join(self.metadir, relurl)
        if not os.path.exists(filepath):
            self.logger.info(u'Downloading %s %s' % (url, filename))
            j = self.download_url(url, loadcookies = self.cookiefile.name)
            if not j:
                self.logger.warning(u'No webpage: %s' % url)
            else:
                self.logger.info(u'Saving %s' % filepath)
                utils.save_file(filepath, j)
                newdls.append(relurl)
        # Metadata is written even for files downloaded on earlier runs,
        # when forced (updateMeta) or missing.
        if os.path.exists(filepath) and \
                (self.updateMeta or not os.path.exists(metapath)):
            metainfo = self.get_meta_info(link, tr, dateobj)
            if metainfo:
                utils.print_tag_file(metapath, metainfo)
    return newdls
def clean_recipe_keys(recipes):
    """Build a new mapping with each recipe key passed through
    url_to_filename, stripping URL special characters."""
    cleaned = {}
    for url in recipes:
        cleaned[url_to_filename(url)] = recipes[url]
    return cleaned
def show_post(post_name):
    """Flask view: render the post whose file matches *post_name*, or a
    plain "Failure" string when no such file exists."""
    filename = utils.url_to_filename(post_name)
    if not os.path.isfile(filename):
        return "Failure"
    loaded = Post.readFromFile(filename)
    return render_template("post_pretty.html", post=loaded)