Code Example #1
File: courtnic.py  Project: sushant354/judis-re
    def get_judgment(self, url, relpath, metainfo):
        filename = utils.url_to_filename(url, False, ["yID", "nID", "ID"])
        if not filename:
            self.logger.warning(u"No filename for %s" % url)
            return

        rel = os.path.join(relpath, filename)
        filepath = os.path.join(self.rawdir, rel)

        if os.path.exists(filepath):
            self.logger.info(u"Already exists %s" % filepath)
        else:
            self.logger.info(u"Downloading %s" % url)
            webpage = self.download_url(url, loadcookies=self.cookiefile.name)
            if not webpage:
                self.logger.warning(u"Could not download %s" % url)
                return

            utils.save_file(filepath, webpage)
            self.logger.info(u"Saved %s" % filepath)

        if os.path.exists(filepath):
            metapath = os.path.join(self.metadir, rel)
            if metainfo and (self.updateMeta or not os.path.exists(metapath)):
                utils.print_tag_file(metapath, metainfo)

        return rel
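
The utils.url_to_filename implementation used by Examples #1 and #2 is not shown on this page; from the call site it appears to build the name from the listed query parameters (yID, nID, ID) and to return None when none of them are present. A minimal sketch with that call shape, offered purely as an assumption about what the helper does (parameter names included), could look like:

from urllib.parse import urlparse, parse_qs

def url_to_filename(url, catchpath, catchquery):
    # Hypothetical sketch, not the judis-re implementation: join the values
    # of the selected query parameters into a filename, or return None.
    query = parse_qs(urlparse(url).query)
    parts = [query[param][0] for param in catchquery if param in query]
    if not parts:
        return None  # caller logs "No filename for ..." and gives up
    return '-'.join(parts)

Under this sketch, a URL such as 'http://example.org/qrydisp.asp?yID=2011&ID=42' would map to '2011-42'; the real project may use a different separator or ordering.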
Code Example #2
    def get_judgment(self, url, relpath, metainfo):
        filename = utils.url_to_filename(url, False, ['yID', 'nID', 'ID'])
        if not filename:
            self.logger.warning(u'No filename for %s' % url)
            return

        rel = os.path.join(relpath, filename)
        filepath = os.path.join(self.rawdir, rel)

        if os.path.exists(filepath):
            self.logger.info(u'Already exists %s' % filepath)
        else:
            self.logger.info(u'Downloading %s' % url)
            webpage = self.download_url(url, loadcookies=self.cookiefile.name)
            if not webpage:
                self.logger.warning(u'Could not download %s' % url)
                return

            utils.save_file(filepath, webpage)
            self.logger.info(u'Saved %s' % filepath)

        if os.path.exists(filepath):
            metapath = os.path.join(self.metadir, rel)
            if metainfo and (self.updateMeta or not os.path.exists(metapath)):
                utils.print_tag_file(metapath, metainfo)

        return rel
Code Example #3
def main(user_url: str) -> None:
    url, params = extract_params(user_url)
    cur_page_num = get_current_page(params)
    last_page_num = get_last_page(user_url, cur_page_num)
    params = params.replace(f'p={cur_page_num}', '')
    # new_url is the url without the p=<page> parameter
    new_url = f'{url}?{params}'

    filename = url_to_filename(new_url, substitute='-', ext='.csv', _os='win')
    DATA_FILENAME = 'data_' + filename
    URLS_FILENAME = 'urls_' + filename
    new_global('DATA_FILENAME', DATA_FILENAME)
    new_global('URLS_FILENAME', URLS_FILENAME)

    bot = None
    if USE_BOT:
        # start browser
        bot = Browser(headless=HEADLESS,
                      proxy=get_global('PROXY'),
                      driverpath=WEBDRIVERPATH)
    new_global('BOT', bot)

    item_urls = []
    if urls_data := load_data(URLS_FILENAME):
        item_urls: List[str] = urls_data.split('\n')
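
Example #3 calls a differently shaped url_to_filename that takes a substitute character, a file extension, and an OS hint, presumably to produce a Windows-safe filename for the CSV output. A rough sketch of such a helper, with the character set and scheme handling assumed rather than taken from the project, might be:

import re

def url_to_filename(url, substitute='-', ext='', _os='win'):
    # Hypothetical sketch: strip the scheme, replace characters that are
    # illegal in Windows filenames with `substitute`, then append `ext`.
    name = re.sub(r'^[a-z]+://', '', url)
    illegal = r'[<>:"/\\|?*]' if _os == 'win' else r'/'
    return re.sub(illegal, substitute, name) + ext

Under these assumptions, 'https://example.com/catalog?sort=price' with ext='.csv' would become 'example.com-catalog-sort=price.csv'.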
Code Example #4
def get_complete_recipes(recipes, image_list):
    """Return intersection of recipe keys and image keys."""
    recipe_keys = [url_to_filename(k) for k in recipes.keys()]
    files = np.array([filename for filename in image_list.keys()
                      if filename in recipe_keys])
    print('{:,} complete recipes found'.format(len(files)))
    return files
Code Example #5
def clean_recipe_keys(recipes):
    """Clean recipe keys by stripping URLs of special characters
    """
    recipes_clean = {}
    for key, value in recipes.items():
        recipes_clean[url_to_filename(key)] = value
    return recipes_clean
Code Example #6
def get_complete_recipes(recipes, image_list):
    """Return intersection of recipe keys and image keys."""
    recipe_keys = [url_to_filename(k) for k in recipes.keys()]
    files = np.array([
        filename for filename in image_list.keys() if filename in recipe_keys
    ])
    print('{:,} complete recipes found'.format(len(files)))
    return files
Code Example #7
File: hello.py  Project: lphk92/NoServerBlog
def edit_post(post_name=None):
    print "Post Name: ", post_name
    if post_name is None:
        return "We'll manage all posts from here"
    else:
        filename = utils.url_to_filename(post_name)
        print "Looking for file:", filename
        if os.path.isfile(filename):
            p = Post.readFromFile(filename)
            return render_template("edit_post.html", post=p)
        else:
            return "No file with name: " + filename
Code Example #8
    def save_page(self, page, save_path):
        """
        Save page to file, if url matches target_url pattern
        """
        try:
            logging.info("Match target_url, saving page of url:%s" %
                         self.url.url)
            save_filename = url_to_filename(self.url.url)
            save_filepath = os.path.join(save_path, save_filename)
            with open(save_filepath, 'w') as fout:
                fout.write(page)

        except Exception as err:
            raise Exception("Save_page failed.\n%s" % err)
Code Example #9
File: utils_test.py  Project: cash2one/mini_spider
    def test_url_to_file(self):
        """Test url_to_file function"""
        file_name = utils.url_to_filename('http://www.baidu.com')
        target_name = 'http:%2F%2Fwww.baidu.com'

        self.assertEqual(file_name, target_name)
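
This test pins down one concrete behaviour of the mini_spider helper: '/' is percent-encoded while ':' is preserved, so the whole URL collapses into a single flat filename. A minimal sketch that satisfies this particular assertion (not necessarily how the project implements it) is a thin wrapper around urllib quoting:

from urllib.parse import quote

def url_to_filename(url):
    # Hypothetical sketch: with safe=':' the colon survives but '/' becomes
    # %2F, which matches the expected 'http:%2F%2Fwww.baidu.com' above.
    return quote(url, safe=':')

In Python 2, which the surrounding projects use, the equivalent call would be urllib.quote(url, safe=':').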
Code Example #10
File: gujarat.py  Project: edudemy/judis-re
    def download_oneday(self, relpath, dateobj):
        newdls  = []

        pageurl = urllib.basejoin(self.baseurl, '/gujarathc/')

        datestr = utils.dateobj_to_str(dateobj, '-')
        dateurl = pageurl + 'orderdatewisedata.jsp?fdate=%s&tdate=%s' % \
                                (datestr, datestr)

        webpage = self.download_url (dateurl, referer = self.baseurl, \
                                     loadcookies = self.cookiefile.name)

        if not webpage:
            self.logger.warning(u'No webpage for %s' % dateurl)            
            return newdls

        webpage = re.sub('(?P<windowopen>window.open\([^)]+\))', \
                         self.sanitize_windowopen, webpage)

        d = utils.parse_webpage(webpage)

        if not d:
            self.logger.error(u'Could not parse html of the result page for date %s' % dateobj)
            return newdls

        trs = d.findAll('tr')
        for tr in trs:
            link = tr.find('a')
            if not link:
                self.logger.info(u'No link in %s' % tr)
                continue

            href = link.get('onclick')
            if not href:
                self.logger.info(u'No href in %s' % tr)
                continue

            reobj = re.search(r"showoj\.jsp\?[^'\s]+", href)
            if not reobj:
                self.logger.info(u'No judgment url in %s' % href)
                continue

            (start, end) = reobj.span()

            pagerelurl = href[start:end]
            url = urllib.basejoin(pageurl, pagerelurl)

            filename = utils.url_to_filename(url, False, ['caseyr', 'caseno', \
                                                          'casetype'])

            if not filename:
                self.logger.error(u'Could not get filename for %s' % url)
                continue
            relurl   = os.path.join(relpath, filename)
            filepath = os.path.join(self.rawdir, relurl)
            metapath = os.path.join(self.metadir, relurl)

            if not os.path.exists(filepath):
                self.logger.info(u'Downloading %s %s' % (url, filename))
                j = self.download_url(url, loadcookies = self.cookiefile.name)
                 
                if not j:
                    self.logger.warning(u'No webpage: %s' % url)
                else:
                    self.logger.info(u'Saving %s' % filepath)
                    utils.save_file(filepath, j)
                    newdls.append(relurl)
           
            if os.path.exists(filepath) and \
                    (self.updateMeta or not os.path.exists(metapath)):
                metainfo = self.get_meta_info(link, tr, dateobj)
                if metainfo:
                    utils.print_tag_file(metapath, metainfo)

        return newdls
Code Example #11
def clean_recipe_keys(recipes):
    """Clean recipe keys by stripping URLs of special characters."""
    recipes_clean = {}
    for key, value in recipes.items():
        recipes_clean[url_to_filename(key)] = value
    return recipes_clean
Code Example #12
File: hello.py  Project: lphk92/NoServerBlog
def show_post(post_name):
    filename = utils.url_to_filename(post_name)
    if os.path.isfile(filename):
        p = Post.readFromFile(filename)
        return render_template("post_pretty.html", post=p)
    return "Failure"