def define_words(a_list):
    """
    :param a_list: a list of words
    :return: a list of tab separated information about each word - word, type of word
    """
    a_result = []
    with open("words_wiki_500.txt", "w") as out_file:
        for word in a_list:
            # Download the page from Cambridge, find the individual parts with a
            # regexp, compose the result and append it to the results.
            clean_word = word.strip()
            logger.debug("word: %s" % clean_word)
            utils.download(get_page(clean_word), get_file_name(clean_word), logger)
            word_type = utils.get_info(
                get_file_name(clean_word),
                'span class="headword">.*?%s.*?span class="pos".*?>(.*?)<' % clean_word,
                logger,
            )
            out_line = "%s\t%s\n" % (clean_word, word_type)
            logger.debug(out_line)
            out_file.write(out_line)
            a_result.append(out_line.strip())
    return a_result
def multiple_engine(self, song, lrc_path, artist, title):
    try:
        ret = False
        result = TTPlayer().request(artist, title)
        if result:
            if config.getboolean("lyrics", "auto_download"):
                ret = utils.download(result[0][2], lrc_path)
            if ret and self.vaild_lrc(lrc_path):
                return lrc_path
            else:
                os.unlink(lrc_path)

        duomi_result = DUOMI().request(artist, title)
        if duomi_result:
            if config.getboolean("lyrics", "auto_download"):
                ret = utils.download(duomi_result[0][2], lrc_path, "gbk")
            if ret and self.vaild_lrc(lrc_path):
                return lrc_path
            else:
                os.unlink(lrc_path)

        soso_result = SOSO().request(artist, title)
        if soso_result:
            if config.getboolean("lyrics", "auto_download"):
                ret = utils.download(soso_result[0][2], lrc_path, "gb18030")
            if ret and self.vaild_lrc(lrc_path):
                return lrc_path
            else:
                os.unlink(lrc_path)
    except:
        return None
def fetch_house_committee_meetings(committees, options):
    # Load any existing meetings file so we can recycle any GUIDs.
    existing_meetings = []
    output_file = output_for("house")
    if os.path.exists(output_file):
        existing_meetings = json.load(open(output_file))

    opts = dict(options)
    opts["binary"] = True
    opts["force"] = True

    meetings = []
    seen_meetings = set()

    # Scrape the committee listing page for a list of committees with scrapable events.
    committee_html = utils.download(
        "http://docs.house.gov/Committee/Committees.aspx",
        "committee_schedule/house_overview.html",
        options)

    for cmte in re.findall(r'<option value="(....)">', committee_html):
        if cmte not in committees:
            logging.error("Invalid committee code: " + cmte)
            continue

        # Download the feed for this committee.
        html = utils.download(
            "http://docs.house.gov/Committee/RSS.ashx?Code=%s" % cmte,
            "committee_schedule/house_%s.xml" % cmte,
            opts)

        # The feed is not really valid XML: convert non-breaking spaces to plain
        # spaces, since the entity is otherwise not recognized.
        html = html.replace("&nbsp;", " ")
        # print html

        # Parse and loop through the meetings listed in the committee feed.
        dom = lxml.etree.fromstring(html)

        for mtg in dom.xpath("channel/item"):
            eventurl = unicode(mtg.xpath("string(link)"))
            event_id = re.search(r"EventID=(\d+)$", eventurl).group(1)
            pubDate = datetime.datetime.fromtimestamp(mktime(parsedate(mtg.xpath("string(pubDate)"))))

            # Skip old records of meetings, some of which just give error pages.
            if pubDate < (datetime.datetime.now() - datetime.timedelta(days=60)):
                continue

            # Events can appear in multiple committee feeds if it is a joint meeting.
            if event_id in seen_meetings:
                logging.info("Duplicated multi-committee event: " + event_id)
                continue
            seen_meetings.add(event_id)

            # Load the XML from the event page and send it to parse_house_committee_meeting.
            # Skip the event if the page could not be loaded (e.g. a bad zipfile).
            if load_xml_from_page(eventurl, options, existing_meetings, committees, event_id, meetings) == False:
                continue

    print "[house] Found %i meetings." % len(meetings)
    return meetings
def extract_content(self):
    classes = self.extract_classes()
    for klass in classes[1:]:  # Exclude ONLINE CLASS
        folder_name = remove_accents(klass['class'])
        create_folder(folder_name)
        print('Extracting Class: {0}'.format(klass['class']))
        self.browser.get('https://unipac-bomdespacho.blackboard.com{0}'.format(klass['href']))
        self.browser.find_element_by_id('header::0-whatsNewView::CO').click()  # Open content list
        block_class_contents = self.browser.find_element_by_id('block::0-whatsNewView::CO')
        class_contents = block_class_contents.find_elements_by_css_selector(
            "a[onclick*='nautilus_utils.actionSelected']"
        )
        i_content = 0
        for i_content in range(i_content, len(class_contents)):
            try:
                block_classes_contents = self.browser.find_element_by_id('block::0-whatsNewView::CO')
                class_contents = block_classes_contents.find_elements_by_css_selector(
                    "a[onclick*='nautilus_utils.actionSelected']"
                )
                class_contents[i_content].click()
                self.check_visibility(By.CLASS_NAME, "individualContent-link")
                file_link = self.browser.find_element_by_class_name('individualContent-link').get_attribute('href')
                cookies = self.browser.get_cookies()
                download(cookies, file_link, folder_name)
                self.browser.back()
                self.check_visibility(By.ID, "block::0-whatsNewView::CO")
            except TimeoutException:
                print("Error in: {0} - {1}".format(klass['class'], klass['href']))
def populate_events(ap_args):
    """Main function to populate the database with archive events.

    :Parameters:
        - ap_args : dict : Information related to archive(s).
    """
    FLOG.debug(ap_args)
    if __name__ != '__main__':
        ap_args = check_if_params(ap_args)
    check_arg_for_none_value(ap_args)
    CLOG.info('DB Populate args :- ' + str(ap_args))

    arch_path = ap_args.get('temp')
    del_arch_path = ap_args.get('delete_temp')
    if not arch_path:
        arch_path = gisc_msgs.TEMP_LOCATION
    # Use equality comparisons: 'is' only checks object identity and is not
    # reliable for comparing string values.
    arg = [k for k in ap_args if k != 'temp'][0]
    if arg == 'url':
        empty_directory(arch_path)
        download(ap_args[arg], arch_path)
    elif arg == 'arch_date':
        empty_directory(arch_path)
        download(get_url_from_date(ap_args[arg]), arch_path)
    elif arg == 'src_dir':
        arch_path = ap_args[arg]
    elif arg == 'zip':
        extract_zip(ap_args[arg], arch_path)
    elif arg == 'files':
        empty_directory(arch_path)
        map(lambda x: handle_valid_invalid_files(x, arch_path), ap_args[arg])
    populate(arch_path)
    if arg != 'src_dir' and del_arch_path:
        empty_directory(arch_path, False)
def main():
    with tf.variable_scope('input') as scope:
        # Use a variable instead of a placeholder because we're training the initial
        # image to make it look like both the content image and the style image.
        input_image = tf.Variable(np.zeros([1, IMAGE_HEIGHT, IMAGE_WIDTH, 3]), dtype=tf.float32)

    utils.download(VGG_DOWNLOAD_LINK, VGG_MODEL, EXPECTED_BYTES)
    model = vgg_model.load_vgg(VGG_MODEL, input_image)
    model['global_step'] = tf.Variable(0, dtype=tf.int32, trainable=False, name='global_step')

    content_image = utils.get_resized_image(CONTENT_IMAGE, IMAGE_HEIGHT, IMAGE_WIDTH)
    content_image = content_image - MEAN_PIXELS
    style_image = utils.get_resized_image(STYLE_IMAGE, IMAGE_HEIGHT, IMAGE_WIDTH)
    style_image = style_image - MEAN_PIXELS

    model['content_loss'], model['style_loss'], model['total_loss'] = _create_losses(model, input_image, content_image, style_image)

    ###############################
    ## TO DO: create optimizer
    model['optimizer'] = tf.train.AdamOptimizer(LR).minimize(model['total_loss'],
                                                             global_step=model['global_step'])
    ###############################

    model['summary_op'] = _create_summary(model)

    initial_image = utils.generate_noise_image(content_image, IMAGE_HEIGHT, IMAGE_WIDTH, NOISE_RATIO)
    train(model, input_image, initial_image)
def get(gist_id, requested_file, destination_dir, facade):
    """ Download a gist file.

    Gists can have several files. This method searches for and downloads a
    single file from a gist. If 'requested_file' is not given, it only succeeds
    when the gist has just a single file.

    :param gist_id: identifier of the gist to download
    :param requested_file: name of the Gist file to download
    :param destination_dir: destination directory after the download
    :param facade: instance of the object that actually performs the request
    """
    # Get the gist information
    response = facade.request_gist(gist_id)

    if response.ok:
        # Gist found. Parse it into a 'model.Gist' class.
        gist_obj = model.Gist(get_json(response))
        list_names = [gistfile.filename for gistfile in gist_obj.files]

        if len(gist_obj.files) == 1 and not requested_file:
            # Download the only file in the gist
            gistfile = gist_obj.files[0]
            download(gistfile.raw_url, destination_dir, gistfile.filename, gistfile.size)
            result = build_result(True, literals.DOWNLOAD_OK, gistfile.filename)
        else:
            # The gist has more than one file and no filename was specified. Error.
            if not requested_file:
                list_names = ", ".join(list_names)
                result = build_result(False, literals.DOWNLOAD_MORE_FILES, list_names)
            else:
                # Search for the requested Gist file
                gistfile = gist_obj.getFile(requested_file)
                if gistfile:
                    # Gist file found. Download it.
                    download(gistfile.raw_url, destination_dir, gistfile.filename, gistfile.size)
                    result = build_result(True, literals.DOWNLOAD_OK, gistfile.filename)
                else:
                    # Requested file not found in the gist
                    list_of_names = ", ".join(list_names)
                    result = build_result(False, literals.FILE_NOT_FOUND, list_of_names)
    else:
        # Handle GitHub response error
        result = build_result(False, literals.DOWNLOAD_ERROR, get_json(response)['message'])

    return result
def install(self, args=None):
    download('http://www.lacusveris.com/PythonTidy/PythonTidy-1.16.python', 'pythontidy.py')
    mkdir(finalTidyDir)
    shutil.move(join(alePath('tmp'), 'pythontidy.py'), finalTidyPath)
    os.system('chmod +x %s' % finalTidyPath)

    logging.info('Patching tidy to wrap at 120 columns instead of 80 ...')
    os.system('patch %s %s' % (finalTidyPath, join(alePath('recipes_all/tidy/'), 'tidy80col.patch')))
def main(url=ZIP_CODE_URL):
    path = utils.DATASET_HOME + ADDRESS_ZIP
    utils.download(url, path)
    files = utils.unzip(path)

    if files and len(files) > 0:
        write_address(files[0])
    else:
        print("Failed to download or unzip the file. Please check {0}.".format(utils.DATASET_HOME))
def __init__(self, input_img):
    # Download the VGG weights file
    utils.download(VGG_DOWNLOAD_LINK, VGG_FILENAME, EXPECTED_BYTES)
    # Load the weights
    self.vgg_layers = scipy.io.loadmat(VGG_FILENAME)["layers"]
    self.input_img = input_img
    # VGG mean-centers its input images, so we first need the mean of the three RGB channels.
    self.mean_pixels = np.array([123.68, 116.779, 103.939]).reshape((1, 1, 1, 3))
def _download(self, src, dst_path):
    """download a file"""
    src_file = os.path.basename(src)
    dst = os.path.join(dst_path, src_file)
    if os.path.isfile(dst):
        logging.info('"%s" already exists, download skipped', dst)
    else:
        utils.make_dirs(dst_path)
        utils.download(src, dst)
def download_report(report):
    report_path = "%s/%s/%s/report.%s" % (report['inspector'], report['year'], report['report_id'], report['file_type'])
    binary = (report['file_type'] == 'pdf')

    utils.download(
        report['url'],
        "%s/%s" % (utils.data_dir(), report_path),
        {'binary': binary}
    )
    return report_path
def do_snapshot_download(self, args):
    '''Download a SNAPSHOT'''
    snapshot = args.snapshot
    body = self.client.snapshot_download(snapshot=snapshot)
    result = utils.loads(body)
    result.pop()
    if len(result) == 0:
        print('Snapshot %s does not exist.' % snapshot)
        return -1
    uri = result.pop()
    utils.download(uri)
def multiple_engine(self, song, lrc_path, artist, title): try: ret = False ting_result = poster.query_lrc_info(artist, title) if ting_result: urls = [item[2] for item in ting_result] for url in urls: ret = utils.download(url, lrc_path) if ret: return lrc_path result = TTPlayer().request(artist, title) if result: urls = [item[2] for item in result] for url in urls: ret = utils.download(url, lrc_path) if ret and self.vaild_lrc(lrc_path): return lrc_path ttpod_result = TTPod().request_data(artist, title) if ttpod_result: with open(lrc_path, 'wb') as fp: fp.write(ttpod_result) return lrc_path duomi_result = DUOMI().request(artist, title) if duomi_result: urls = [item[2] for item in duomi_result] for url in urls: ret = utils.download(url, lrc_path, "gbk") if ret and self.vaild_lrc(lrc_path): return lrc_path soso_result = SOSO().request(artist, title) if soso_result: urls = [item[2] for item in soso_result] for url in urls: ret = utils.download(url, lrc_path, "gb18030") if ret and self.vaild_lrc(lrc_path): return lrc_path try: os.unlink(lrc_path) except: pass return None except Exception, e: print e return None
def download_to_cache(meta):
    if not isdir(SRC_CACHE):
        os.makedirs(SRC_CACHE)

    fn = meta['fn']
    md5 = meta.get('md5')
    path = join(SRC_CACHE, fn)
    if not isfile(path):
        download(meta['url'], path, md5)

    if md5 and not md5_file(path) == md5:
        raise Exception("MD5 mismatch: %r" % meta)
    return path
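# Hedged usage sketch for download_to_cache (not part of the original module): it assumes
# the recipe metadata is a plain dict with 'fn', 'url' and optional 'md5' keys; the file
# name and URL below are made up for illustration only.
if __name__ == "__main__":
    example_meta = {
        "fn": "example-1.0.tar.gz",                       # cached under SRC_CACHE/<fn>
        "url": "https://example.com/example-1.0.tar.gz",  # hypothetical source URL
        "md5": None,                                      # no checksum -> verification is skipped
    }
    print(download_to_cache(example_meta))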
def iso(self, name):
    """download ISOs"""
    dst = os.path.join(self.cfg['paths']['isos'], name + '.iso')
    if os.path.isfile(dst):
        logging.info('nothing to do, "%s" already exists', dst)
        return

    distribution = name.split('-', 1)[0]
    if distribution == 'ubuntu':
        url = self.iso_ubuntu(name)
    elif distribution == 'debian':
        url = self.iso_debian(name)
    else:
        # Fail early instead of hitting an unbound 'url' below.
        raise ValueError('unsupported distribution: %s' % distribution)

    utils.make_dirs(self.cfg['paths']['isos'])
    utils.download(url, dst)
def vote_ids_for_house(congress, session_year, options): vote_ids = [] index_page = "http://clerk.house.gov/evs/%s/index.asp" % session_year group_page = r"ROLL_(\d+)\.asp" link_pattern = r"http://clerk.house.gov/cgi-bin/vote.asp\?year=%s&rollnumber=(\d+)" % session_year # download index page, find the matching links to the paged listing of votes page = utils.download( index_page, "%s/votes/%s/pages/house.html" % (congress, session_year), options) if not page: logging.error("Couldn't download House vote index page, aborting") return None # extract matching links doc = html.document_fromstring(page) links = doc.xpath( "//a[re:match(@href, '%s')]" % group_page, namespaces={"re": "http://exslt.org/regular-expressions"}) for link in links: # get some identifier for this inside page for caching grp = re.match(group_page, link.get("href")).group(1) # download inside page, find the matching links page = utils.download( urlparse.urljoin(index_page, link.get("href")), "%s/votes/%s/pages/house_%s.html" % (congress, session_year, grp), options) if not page: logging.error("Couldn't download House vote group page (%s), aborting" % grp) continue doc = html.document_fromstring(page) votelinks = doc.xpath( "//a[re:match(@href, '%s')]" % link_pattern, namespaces={"re": "http://exslt.org/regular-expressions"}) for votelink in votelinks: num = re.match(link_pattern, votelink.get("href")).group(1) vote_id = "h" + num + "-" + str(congress) + "." + session_year if not should_process(vote_id, options): continue vote_ids.append(vote_id) return utils.uniq(vote_ids)
def download_chainfile(source_assembly, target_assembly):
    """
    Download if needed, putting in the cache_dir.

    If the environmental variable HUBWARD_CACHE_DIR does not exist, then use
    ~/.hubward_cache
    """
    cache_dir = os.environ.get(
        'HUBWARD_CACHE_DIR', os.path.expanduser('~/.hubward_cache'))
    utils.makedirs(cache_dir)
    url = chainfile_url(source_assembly, target_assembly)
    dest = os.path.join(cache_dir, os.path.basename(url))
    utils.download(url, dest)
    return dest
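# Hedged usage sketch (not from the original project): the assembly names are whatever
# chainfile_url() accepts; "hg19"/"hg38" below are illustrative UCSC-style examples.
if __name__ == "__main__":
    chain = download_chainfile("hg19", "hg38")
    print("liftOver chain file cached at:", chain)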
def fetch_version(bill_version_id, options):
    # Download MODS etc.
    logging.info("\n[%s] Fetching..." % bill_version_id)

    bill_type, number, congress, version_code = utils.split_bill_version_id(bill_version_id)
    # bill_id = "%s%s-%s" % (bill_type, number, congress)

    utils.download(
        mods_url_for(bill_version_id),
        document_filename_for(bill_version_id, "mods.xml"),
        utils.merge(options, {'binary': True, 'to_cache': False})
    )

    return write_bill_version_metadata(bill_version_id)
def POST(self, category): i = web.input('olid', author=None, file={}, source_url=None, success_url=None, failure_url=None) success_url = i.success_url or web.ctx.get('HTTP_REFERRER') or '/' failure_url = i.failure_url or web.ctx.get('HTTP_REFERRER') or '/' def error(code__msg): (code, msg) = code__msg print("ERROR: upload failed, ", i.olid, code, repr(msg), file=web.debug) _cleanup() url = changequery(failure_url, errcode=code, errmsg=msg) raise web.seeother(url) if i.source_url: try: data = download(i.source_url) except: error(ERROR_INVALID_URL) source_url = i.source_url elif i.file is not None and i.file != {}: data = i.file.value source_url = None else: error(ERROR_EMPTY) if not data: error(ERROR_EMPTY) try: save_image(data, category=category, olid=i.olid, author=i.author, source_url=i.source_url, ip=web.ctx.ip) except ValueError: error(ERROR_BAD_IMAGE) _cleanup() raise web.seeother(success_url)
def go_to_list_html(param, cookie, viewstate, retry):
    global proxy
    logger.info('search landing page - starting initial search ------ %s' % retry)
    if retry <= 0:
        return None
    url = 'http://ehire.51job.com/Candidate/SearchResumeNew.aspx'
    # param = {"function_code": "0107", "functionName": "软件工程师", "region_code": "010000",
    #          "regionName": "北京"}
    data = arouse_utils.get_frist_post_headers(viewstate, param=param)
    logger.info(proxy)
    result = utils.download(url=url, data=data, proxy=proxy, cookie=cookie, method='post')

    if result['code'] != 0:
        logger.error("error connecting to the page, retrying: retry= %s" % retry)
        proxy = utils.get_proxy()
        return go_to_list_html(param, cookie, viewstate, retry - 1)
    elif '用户数不够' in result['data'] or len(result['data']) < 1000:
        # '用户数不够' ("not enough users") in the response indicates a broken proxy reply.
        logger.error("proxy error, retrying: retry= %s" % retry)
        proxy = utils.get_proxy()
        return go_to_list_html(param, cookie, viewstate, retry - 1)
    if '您的操作过于频繁,请注意劳逸结合' in result['data']:
        # The page warns that requests are too frequent; give up.
        return None
    return result['data']
def go_to_search_html(cookie, retry):
    global proxy
    if retry <= 0:
        return None
    logger.info('navigating to the search landing page ------ %s' % retry)
    url = 'http://ehire.51job.com/Candidate/SearchResumeIndexNew.aspx'
    headers = arouse_utils.get_get_headers(
        'http://ehire.51job.com/Navigate.aspx')
    if not proxy:
        proxy = utils.get_proxy()
    logger.info(proxy)
    utils_download = utils.download(url=url, headers=headers, proxy=proxy, cookie=cookie)

    if utils_download['code'] != 0:
        logger.error('error on the search page: %s %s' % (url, retry))
        if utils_download.get(
                'data'
        ) and '<a href="/MainLogin.aspx?returl=' in utils_download['data']:
            return 'login'
        proxy = utils.get_proxy()
        return go_to_search_html(cookie, retry - 1)
    if '<a href="/MainLogin.aspx?returl=' in utils_download['data']:
        return 'login'
    viewstate = arouse_utils.find(
        '<input type="hidden" name="__VIEWSTATE" id="__VIEWSTATE" value="(.*?)" />',
        utils_download['data'])
    if not viewstate:
        proxy = utils.get_proxy()
        return go_to_search_html(cookie, retry - 1)
    return viewstate
def download_page(url=None, method=None, header=None, proxy=None, data=None, session=None, need_session=False):
    logger = utils.get_logger()
    if not proxy:
        proxy = local_proxy()
    for x in xrange(0, 5):
        result = utils.download(url=url, headers=header, data=data, method=method,
                                need_session=need_session, session=session, proxy=proxy,
                                allow_redirects=True, retry_time=1)
        if result['code'] == 0:
            logger.info('success when download %s ' % url)
            break
        else:
            proxy = local_proxy()
            result['code'] = 500
            time.sleep(1)
    result['proxy'] = proxy
    return result
def candidate_for(bioguide): url = current_bioguide[bioguide]["terms"][-1].get("url", None) if not url: if debug: print "[%s] No official website, skipping" % bioguide return None if debug: print "[%s] Downloading..." % bioguide cache = "congress/%s.html" % bioguide body = utils.download(url, cache, force) all_matches = [] for regex in regexes[service]: matches = re.findall(regex, body, re.I) if matches: all_matches.extend(matches) if all_matches: for candidate in all_matches: passed = True for blacked in blacklist[service]: if re.search(blacked, candidate, re.I): passed = False if not passed: if debug: print "\tBlacklisted: %s" % candidate continue return candidate return None
def fetch_nomination(nomination_id, options={}): logging.info("\n[%s] Fetching..." % nomination_id) # fetch committee name map, if it doesn't already exist nomination_type, number, congress = utils.split_nomination_id(nomination_id) if not number: return {'saved': False, 'ok': False, 'reason': "Couldn't parse %s" % nomination_id} if not utils.committee_names: utils.fetch_committee_names(congress, options) # fetch bill details body body = utils.download( nomination_url_for(nomination_id), nomination_cache_for(nomination_id, "information.html"), options) if not body: return {'saved': False, 'ok': False, 'reason': "failed to download"} if options.get("download_only", False): return {'saved': False, 'ok': True, 'reason': "requested download only"} ''' # TO DO ## detect group nominations, particularly for military promotions ## detect when a group nomination is split into sub nominations because of divergent Senate action ''' nomination = parse_nomination(nomination_id, body, options) output_nomination(nomination, options) return {'ok': True, 'saved': True}
def next_html(account_cookies, data, retry):
    logger.info('starting the next page %s %s' % (
        account_cookies.get('userName', ''),
        retry,
    ))
    global proxy
    if retry <= 0:
        return None
    cookie = account_cookies.get('cookie')
    headers = arouse_utils.get_get_headers()
    url = 'http://ehire.51job.com/Candidate/SearchResumeNew.aspx'
    logger.info(proxy)
    result = utils.download(url=url, data=data, proxy=proxy, cookie=cookie, headers=headers, method='post')

    if result['code'] != 0:
        logger.error("error connecting to the page, retrying: retry= %s" % retry)
        proxy = utils.get_proxy()
        return next_html(account_cookies, data, retry - 1)
    elif '用户数不够' in result['data'] or len(result['data']) < 1000:
        # '用户数不够' ("not enough users") in the response indicates a broken proxy reply.
        logger.error("proxy error, retrying: retry= %s" % retry)
        proxy = utils.get_proxy()
        return next_html(account_cookies, data, retry - 1)
    if '您的操作过于频繁,请注意劳逸结合' in result['data']:
        # The page warns that requests are too frequent; give up.
        return None
    return result['data']
def mirror_package(sitemap, package_name, lastmod, content_detail_url, options): """Create a local mirror of a FDSys package.""" # Return a list of files we downloaded. results = [] if not options.get("granules", False): # Most packages are just a package. This is the usual case. results = mirror_package_or_granule(sitemap, package_name, None, lastmod, options) else: # In some collections, like STATUTE, each document has subparts which are not # described in the sitemap. Load the main HTML page and scrape for the sub-files. # In the STATUTE collection, the MODS information in granules is redundant with # information in the top-level package MODS file. But the only way to get granule- # level PDFs is to go through the granules. content_index = utils.download(content_detail_url, "fdsys/package/%s/%s/%s.html" % (sitemap["year"], sitemap["collection"], package_name), utils.merge(options, { 'binary': True, })) if not content_index: raise Exception("Failed to download %s" % content_detail_url) for link in html.fromstring(content_index).cssselect("table.page-details-data-table td.rightLinkCell a"): if link.text == "More": m = re.match("granule/(.*)/(.*)/content-detail.html", link.get("href")) if not m or m.group(1) != package_name: raise Exception("Unmatched granule URL %s" % link.get("href")) granule_name = m.group(2) results = mirror_package_or_granule(sitemap, package_name, granule_name, lastmod, options) return results
def get_matching_pages(): # Does a Wikipedia API search for pages containing either of the # two templates. Returns the pages. page_titles = set() for template in ("CongLinks", "CongBio"): eicontinue = "" while True: # construct query URL, using the "eicontinue" of the last query to get the next batch url = 'http://en.wikipedia.org/w/api.php?action=query&list=embeddedin&eititle=Template:%s&eilimit=500&format=xml' % template if eicontinue: url += "&eicontinue=" + eicontinue # load the XML print("Getting %s pages (%d...)" % (template, len(page_titles))) dom = lxml.etree.fromstring(utils.download(url, None, True)) # can't cache eicontinue probably for pgname in dom.xpath("query/embeddedin/ei/@title"): page_titles.add(pgname) # get the next eicontinue value and loop eicontinue = dom.xpath("string(query-continue/embeddedin/@eicontinue)") if not eicontinue: break return page_titles
def fetch(repo):
    """
    Downloads a repository given the url of a zip and returns the contained modules.
    Automatically excludes modules that require an unsupported spec_version.
    The result is a dictionary of dictionaries with this structure:
    { identifier: { version: metadata } }
    """
    logging.info("Fetching repo {}".format(repo["name"]))
    archive = ZipFile(StringIO(utils.download(repo["url"])))
    result = dict()
    for entry in [name for name in archive.namelist() if name.endswith(".ckan")]:
        metadata = json.loads(archive.read(entry))
        id, ver = metadata["identifier"], metadata["version"]
        if not metadata["spec_version"] in pykan.supported_spec_versions:
            logging.debug("Ignoring {} {} because of unsupported spec_version {}"
                          .format(id, ver, metadata["spec_version"]))
            continue
        # result[identifier] = { version: metadata }
        # Create the per-identifier dict on first sight, otherwise add the version to it.
        if id not in result:
            result[id] = {ver: metadata}
        else:
            result[id][ver] = metadata
    return result
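# Hedged usage sketch (not from the original project): the repository entry below is a
# made-up placeholder; a real entry would point at an actual metadata zip archive.
if __name__ == "__main__":
    modules = fetch({"name": "example-repo", "url": "https://example.com/ckan-meta-master.zip"})
    for identifier, versions in modules.items():
        print(identifier, sorted(versions))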
def fetch_bioguide_page(bioguide, force):
    url = "http://bioguide.congress.gov/scripts/biodisplay.pl?index=%s" % bioguide
    cache = "legislators/bioguide/%s.html" % bioguide
    try:
        body = download(url, cache, force)

        # Fix a problem?
        body = body.replace("Á\xc2\x81", "Á")

        # Entities like ’ are in Windows-1252 encoding. Normally lxml
        # handles that for us, but we're also parsing HTML. The lxml.html.HTMLParser
        # doesn't support specifying an encoding, and the lxml.etree.HTMLParser doesn't
        # provide a cssselect method on element objects. So we'll just decode ourselves.
        body = utils.unescape(body, "Windows-1252")

        dom = lxml.html.parse(io.StringIO(body)).getroot()
    except lxml.etree.XMLSyntaxError:
        raise Exception("Error parsing: " + url)

    # Sanity check.
    if len(dom.cssselect("title")) == 0:
        raise Exception("No page for bioguide %s!" % bioguide)

    return dom
def POST(self, category): i = web.input(olid=None, author=None, data=None, source_url=None, ip=None, _unicode=False) web.ctx.pop("_fieldstorage", None) web.ctx.pop("_data", None) def error(code__msg): (code, msg) = code__msg _cleanup() e = web.badrequest() e.data = simplejson.dumps({"code": code, "message": msg}) raise e source_url = i.source_url data = i.data if source_url: try: data = download(source_url) except: error(ERROR_INVALID_URL) if not data: error(ERROR_EMPTY) try: d = save_image(data, category=category, olid=i.olid, author=i.author, source_url=i.source_url, ip=i.ip) except ValueError: error(ERROR_BAD_IMAGE) _cleanup() return simplejson.dumps({"ok": "true", "id": d.id})
def run(options):
    # Download the TSV file.
    cache_zip_path = "adler-wilkerson-bills.zip"
    utils.download(
        "http://congressionalbills.org/billfiles/bills80-92.zip",
        cache_zip_path,
        utils.merge(options, {'binary': True, 'needs_content': False}))

    # Unzip in memory and process the records.
    zfile = zipfile.ZipFile(utils.cache_dir() + "/" + cache_zip_path)
    csvreader = csv.DictReader(zfile.open("bills80-92.txt"), delimiter="\t")
    for record in csvreader:
        rec = process_bill(record)

        import pprint
        pprint.pprint(rec)
def get_input(self):
    args = self.arguments
    if args.download:
        self.log.info("downloading")
        archive_name, url = self.downloadInfo()
        return open(download(args.data_dir, archive_name, url), 'r')
    return args.input
def vote_ids_for_senate(congress, session_year, options): session_num = int(session_year) - utils.get_congress_first_year(int(congress)) + 1 vote_ids = [] page = utils.download( "http://www.senate.gov/legislative/LIS/roll_call_lists/vote_menu_%s_%d.xml" % (congress, session_num), "%s/votes/%s/pages/senate.xml" % (congress, session_year), utils.merge(options, {'binary': True}) ) if not page: logging.error("Couldn't download Senate vote XML index, aborting") return None dom = etree.fromstring(page) # Sanity checks. if int(congress) != int(dom.xpath("congress")[0].text): logging.error("Senate vote XML returns the wrong Congress: %s" % dom.xpath("congress")[0].text) return None if int(session_year) != int(dom.xpath("congress_year")[0].text): logging.error("Senate vote XML returns the wrong session: %s" % dom.xpath("congress_year")[0].text) return None # Get vote list. for vote in dom.xpath("//vote"): num = int(vote.xpath("vote_number")[0].text) vote_id = "s" + str(num) + "-" + str(congress) + "." + session_year if not should_process(vote_id, options): continue vote_ids.append(vote_id) return vote_ids
def create_task_from_mysql(use_keyword='0'): logger = utils.get_logger() logger.info('start create task from mysql.') mysql_pool = PersistentDB( MySQLdb, host=common_settings.MYSQL_HOST, user=common_settings.MYSQL_USER, passwd=common_settings.MYSQL_PASSWD, db='spider', port=common_settings.MYSQL_PORT, charset='utf8' ) conn = mysql_pool.connection() cur = conn.cursor() # city_number = cur.execute('select code from city_entrence where source="ZHAO_PIN_GOU" and valid=1') # cities = cur.fetchall() function_number = cur.execute('select * from function_entrence where source="ZHAO_PIN_GOU" and valid=1') functions = cur.fetchall() # logger.info('the number of city and functions is:%s, %s' % (city_number, function_number)) # if not city_number or not function_number: # return logger.info('the number of functions is:%s, %s' % (len(city_order), function_number)) if not function_number: return add_task_url = common_settings.TASK_URL + common_settings.CREATE_TASK_PATH headers = {'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8', } today = datetime.datetime.today() next_datetime = datetime.datetime(today.year, today.month, today.day, 0, 0, 0) + datetime.timedelta(days=1) deadline = int(time.mktime(time.strptime(next_datetime.strftime('%Y-%m-%d %H:%M:%S'), '%Y-%m-%d %H:%M:%S'))) * 1000 # use_keyword = '0' if datetime.datetime.now().hour<12 else '1' # city_result = [] # if cities: # cities = [i[0] for i in cities] # for i in city_order: # if i in cities: # city_result.append(i) # cities.remove(i) # city_result = city_result + cities random.shuffle(city_order) for city in city_order: for function in functions: add_task_data = { "callSystemID": "morgan-zhaopingou-resume-1", "source": 'ZHAO_PIN_GOU', "traceID": str(uuid.uuid1()), # "executeParam": json.loads(i.strip()), "executeParam": json.dumps( {"fenleiName": function[4], "pFenLeiName": function[1], "positionName": function[7], "hopeAdressStr": city, "fId": int(function[5]), "pFId": int(function[2]), "pId": int(function[8]), "id": int(function[11]), 'use_keyword': use_keyword}, ensure_ascii=False), "taskType": "RESUME_FETCH", "deadline": deadline } add_task_result = utils.download(url=add_task_url, is_json=True, headers=headers, method='post', data=add_task_data) logger.info('done.')
def start_mission(self):
    if self.artist_name:
        query_result = multi_query_artist_engine(self.artist_name)
        if query_result:
            if utils.download(query_result, get_tmp_save_path(self.artist_name)):
                cleanup_cover(get_tmp_save_path(self.artist_name),
                              get_cover_save_path(self.artist_name))
def download_file(url, dt_referencia, file_name):
    # Check whether the file should be downloaded.
    if not utils.check_download(dt_referencia, file_name):
        return False

    dt_referencia = dt_referencia.strftime('%d/%m/%Y')
    params = {
        'Titulo_1': 'quadro-resumo',
        'Consulta_1': 'Ambos',
        'Dt_Ref': dt_referencia,
        'DataIni': dt_referencia,
        'DataFim': dt_referencia,
        'Indice': 'quadro-resumo',
        'Consulta': 'Ambos',
        'saida': 'csv',
        'Idioma': 'PT'
    }
    utils.download(url, params, file_name)
def transformer_download(to_path=DEFAULT_TO_PATH, replace_existing=False) -> str:
    transformer_path = os.path.abspath(
        os.path.join(to_path, "distilbert-base-nli-mean-tokens"))
    if os.path.exists(transformer_path) and not replace_existing:
        print(f"already downloaded! {transformer_path}")
        return transformer_path

    transformer_zip = os.path.join(to_path, "distilbert-base-nli-mean-tokens.zip")
    download(
        "https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/v0.2/distilbert-base-nli-mean-tokens.zip",
        transformer_zip,
    )
    with ZipFile(transformer_zip, "r") as z:
        z.extractall(transformer_path)
    os.remove(transformer_zip)
    return transformer_path
def baixar_titulos():
    html = download('http://www.tce.pi.gov.br/')
    soup = bs(html, 'html.parser')
    news_div = soup.find('div', id='latestnews')
    lista_news = news_div.find_all('li', class_='latestnews')
    return lista_news
def scrape(options):
    hoy = datetime.datetime.now().strftime('%d%m%Y')
    fecha = options.get('fecha', hoy)
    integracion = options.get('integracion', 'D')
    tipoleg = options.get('tipoleg', 'Tit')
    orden = options.get('orden', 'Legislador')
    grafico = options.get('grafico', 's')
    query = "?Fecha=%s&Cuerpo=%s&Integracion=%s&TipoLeg=%s&Orden=%s&Grafico=%s" % (
        fecha, cuerpo, integracion, tipoleg, orden, grafico)
    url_to_scrape = "http://www.parlamento.gub.uy/GxEmule/IntcpoGrafico.asp%s" % query

    logging.info(
        "Scraping information on deputies from the parliament's website. \nURL: %s." % url_to_scrape)

    body = utils.download(url_to_scrape,
                          'legisladores/camara_%s_%s.html' % (cuerpo, hoy),
                          options.get('force', False), options)
    doc = lxml.html.document_fromstring(body)
    tablas = doc.xpath("//table")
    rows = tablas[3].cssselect('tr td')

    diputados = []
    for row in rows:
        mail_base = row.xpath("a[starts-with(@href, 'mailto')]/@href")
        if mail_base:
            email = mail_base[0].split(':', 1)[1]
        else:
            email = ''
        congress_people = {
            'nombre': format_word(
                row.xpath('br/following-sibling::text()')[0].split(',')[1]),
            'apellido': format_word(
                row.xpath('br/following-sibling::text()')[0].split(',')[0]),
            'partido': format_word(row.xpath('br/following-sibling::text()')[1]),
            'email': email,
            'foto': base_url + row.xpath('img/@src')[0],
            'departamento': format_word(row.xpath('br/following-sibling::text()')[2])
        }
        diputados.append(congress_people)

    output_path = "data/diputados.json"
    utils.write(
        json.dumps(diputados, sort_keys=True, indent=2,
                   default=utils.format_datetime, encoding="utf-8"),
        output_path)
def __init__(self,): sha1sum_id = "72cb19612318bb304d4a169804f525f88dc3f0d0" dataset = "petfinder" file_name = f"{dataset}_for_unit_tests.zip" url = get_repo_url() + file_name save_path = os.path.join(get_data_home_dir(), file_name) self._path = os.path.join(get_data_home_dir(), dataset) download( url=url, path=save_path, sha1_hash=sha1sum_id, ) protected_zip_extraction( save_path, sha1_hash=sha1sum_id, folder=self._path, ) self._train_df = pd.read_csv(os.path.join(self._path, 'train.csv'), index_col=0) self._test_df = pd.read_csv(os.path.join(self._path, 'test.csv'), index_col=0) for img_col in self.image_columns: self._train_df[img_col] = self._train_df[img_col].apply( lambda ele: path_expander(ele, base_folder=os.path.join(self._path, "images"))) self._test_df[img_col] =\ self._test_df[img_col].apply( lambda ele: path_expander(ele, base_folder=os.path.join(self._path, "images"))) print(self._train_df[img_col][0]) print(self._test_df[img_col][0]) _, self._train_df = train_test_split( self._train_df, test_size=0.1, random_state=np.random.RandomState(123), stratify=self._train_df[self.label_columns[0]], ) _, self._test_df = train_test_split( self._test_df, test_size=0.1, random_state=np.random.RandomState(123), stratify=self._test_df[self.label_columns[0]], ) self._train_df.reset_index(drop=True, inplace=True) self._test_df.reset_index(drop=True, inplace=True) print(f"train sample num: {len(self._train_df)}") print(f"test sample num: {len(self._test_df)}")
def prepare_ffmpeg(out_dir=None, version=None): ffmpeg_version = "4.1.3" output_dir = os.getcwd() + "/test/" ffmpeg_win32_dev_url = "https://ffmpeg.zeranoe.com/builds/win32/dev/ffmpeg-" + ffmpeg_version + "-win32-dev.zip" ffmpeg_win64_dev_url = "https://ffmpeg.zeranoe.com/builds/win64/dev/ffmpeg-" + ffmpeg_version + "-win64-dev.zip" ffmpeg_win32_shared_url = "https://ffmpeg.zeranoe.com/builds/win32/shared/ffmpeg-" + ffmpeg_version + "-win32-shared.zip" ffmpeg_win64_shared_url = "https://ffmpeg.zeranoe.com/builds/win64/shared/ffmpeg-" + ffmpeg_version + "-win64-shared.zip" pre_downloads = [ ffmpeg_win32_dev_url, ffmpeg_win64_dev_url, ffmpeg_win32_shared_url, ffmpeg_win64_shared_url ] wanted_downloads = {} for d in pre_downloads: wanted_downloads[d] = d[d.rfind('/')+1:] for url, filename in wanted_downloads.items(): if not Path(output_dir + filename).is_file() or os.stat(output_dir + filename).st_size == 0: print("Downloading " + url + " ...") download(url, output_dir + filename) print("Extract " + filename + " ...") ZipFile(output_dir + filename, 'r').extractall(output_dir) # Copy include headers shutil.copytree(output_dir + get_dirname_from_link(ffmpeg_win32_dev_url) + "/include", output_dir + "/include/ffmpeg") # Copy libraries shutil.copytree(output_dir + get_dirname_from_link(ffmpeg_win32_dev_url) + "/lib", output_dir + "/lib/ffmpeg/win32") shutil.copytree(output_dir + get_dirname_from_link(ffmpeg_win64_dev_url) + "/lib", output_dir + "/lib/ffmpeg/win64") # Copy dlls shutil.copytree(output_dir + get_dirname_from_link(ffmpeg_win32_shared_url) + "/bin", output_dir + "/bin/win32") shutil.copytree(output_dir + get_dirname_from_link(ffmpeg_win64_shared_url) + "/bin", output_dir + "/bin/win64") unnecessary_zip_files = [] unnecessary_dirs = [] for url, zip_file in wanted_downloads.items(): unnecessary_zip_files.append(output_dir + zip_file) unnecessary_dirs.append( output_dir + zip_file[:zip_file.rfind('.')]) remove_unnecessary_files_or_dirs( unnecessary_zip_files, unnecessary_dirs)
def get_sitemap(year, collection, lastmod, options): """Gets a single sitemap, downloading it if the sitemap has changed. Downloads the root sitemap (year==None, collection==None), or the sitemap for a year (collection==None), or the sitemap for a particular year and collection. Pass lastmod which is the current modification time of the file according to its parent sitemap, which is how it knows to return a cached copy. Returns the sitemap parsed into a DOM. """ # Construct the URL and the path to where to cache the file on disk. if year == None: url = "http://www.gpo.gov/smap/fdsys/sitemap.xml" path = "fdsys/sitemap/sitemap.xml" elif collection == None: url = "http://www.gpo.gov/smap/fdsys/sitemap_%s/sitemap_%s.xml" % (year, year) path = "fdsys/sitemap/%s/sitemap.xml" % year else: url = "http://www.gpo.gov/smap/fdsys/sitemap_%s/%s_%s_sitemap.xml" % (year, year, collection) path = "fdsys/sitemap/%s/%s.xml" % (year, collection) # Should we re-download the file? lastmod_cache_file = utils.cache_dir() + "/" + path.replace(".xml", "-lastmod.txt") if options.get("cached", False): # If --cached is used, don't hit the network. force = False elif not lastmod: # No *current* lastmod date is known for this file (because it is the master # sitemap file, probably), so always download. force = True else: # If the file is out of date or --force is used, download the file. cache_lastmod = utils.read(lastmod_cache_file) force = (lastmod != cache_lastmod) or options.get("force", False) if force: logging.warn("Downloading: %s" % url) body = utils.download(url, path, utils.merge(options, { 'force': force, 'binary': True })) if not body: raise Exception("Failed to download %s" % url) # Write the current last modified date to disk so we know the next time whether # we need to fetch the file. if lastmod and not options.get("cached", False): utils.write(lastmod, lastmod_cache_file) try: return etree.fromstring(body) except etree.XMLSyntaxError as e: raise Exception("XML syntax error in %s: %s" % (url, str(e)))
def search(self, q):
    results = {"movies": [], "people": []}
    q = urlparse.quote(q.encode('utf8'))
    url = self.baseURL + "/search/?q=" + q + "&p=.htm"
    soup = utils.download(url)
    soup = search.trim_search_page(soup)
    results["movies"] = search.movie_search(q, soup)
    results["people"] = search.people_search(soup)
    return results
def mirror_file(year, collection, package_name, lastmod, granule_name, file_types, options): # Where should we store the file? path = get_output_path(year, collection, package_name, granule_name, options) if not path: return # should skip # Do we need to update this record? lastmod_cache_file = path + "/lastmod.txt" cache_lastmod = utils.read(lastmod_cache_file) force = ((lastmod != cache_lastmod) or options.get( "force", False)) and not options.get("cached", False) # Try downloading files for each file type. targets = get_package_files(package_name, granule_name, path) for file_type in file_types: if file_type not in targets: raise Exception("Invalid file type: %s" % file_type) f_url, f_path = targets[file_type] if (not force) and os.path.exists(f_path): continue # we already have the current file logging.warn("Downloading: " + f_path) data = utils.download( f_url, f_path, utils.merge( options, { 'xml': True, 'force': force, 'to_cache': False, 'needs_content': file_type == "text" and f_path.endswith(".html"), })) if not data: if file_type == "pdf": # expected to be present for all packages raise Exception("Failed to download %s" % package_name) else: # not all packages have all file types, but assume this is OK logging.error("file not found: " + f_url) if file_type == "text" and f_path.endswith(".html"): # The "text" format files are put in an HTML container. Unwrap it into a .txt file. # TODO: Encoding? The HTTP content-type header says UTF-8, but do we trust it? # html.fromstring does auto-detection. with open(f_path[0:-4] + "txt", "w") as f: text_content = unicode(html.fromstring(data).text_content()) f.write(text_content.encode("utf8")) # Write the current last modified date to disk so we know the next time whether # we need to fetch the files for this sitemap item. if lastmod and not options.get("cached", False): utils.write(lastmod, lastmod_cache_file)
def process_dent(self, nwo, ext, library_candidates) -> Tuple[List[Dict[str, Any]], List[Tuple[str, str]]]: # Process dependents (applications) to get function calls dents = [] edges = [] _, nwo = remap_nwo(nwo) if nwo is None: return dents, edges tmp_dir = download(nwo) files = walk(tmp_dir, ext) sha = None for f in files: context_and_calls = self.get_context_and_function_calls(f) if context_and_calls is None: continue if sha is None: sha = get_sha(tmp_dir, nwo) nwo, path, context, calls = context_and_calls libraries = [] for cxt in context: if type(cxt) == dict: libraries.extend([v.split('.')[0] for v in cxt.values()]) elif type(cxt) == list: libraries.extend(cxt) match_scopes = {} for cxt in set(libraries): if cxt in library_candidates: match_scopes[cxt] = library_candidates[cxt] for call in calls: for depended_library_name, dependend_library_functions in match_scopes.items(): for depended_library_function in dependend_library_functions: # Other potential filters: len(call['identifier']) > 6 or len(call['identifier'].split('_')) > 1 if (call['identifier'] not in self.language_parser.STOPWORDS and ((depended_library_function['identifier'].split('.')[-1] == '__init__' and call['identifier'] == depended_library_function['identifier'].split('.')[0]) or ((len(call['identifier']) > 9 or (not call['identifier'].startswith('_') and len(call['identifier'].split('_')) > 1)) and call['identifier'] == depended_library_function['identifier']) )): dent = { 'nwo': nwo, 'sha': sha, 'path': path, 'language': self.language, 'identifier': call['identifier'], 'argument_list': call['argument_list'], 'url': 'https://github.com/{}/blob/{}/{}#L{}-L{}'.format(nwo, sha, path, call['start_point'][0] + 1, call['end_point'][0] + 1) } dents.append(dent) edges.append((dent['url'], depended_library_function['url'])) return dents, edges
def download(self, basin):
    """Download IBTrACS data.
    """
    self.logger.info('Downloading IBTrACS')

    utils.setup_signal_handler()
    utils.set_format_custom_text(
        self.CONFIG['ibtracs']['data_name_length'])

    # url = self.CONFIG['ibtracs']['urls']['since1980']
    url = self.CONFIG['ibtracs']['urls'][basin]
    file = url.split('/')[-1]
    file = file[:-3].replace('.', '_') + '.nc'
    dir = self.CONFIG['ibtracs']['dirs']
    os.makedirs(dir, exist_ok=True)
    self.ibtracs_file_path = f'{dir}{file}'

    utils.download(url, self.ibtracs_file_path, progress=True)
def main(url=JAPANESE_URL):
    content = utils.download(url)
    soup = BeautifulSoup(content)
    text_nodes = soup.find_all(name=["p", "td", "li"], text=True)

    texts = []
    for t in [tn.text for tn in text_nodes]:
        texts.append([t])

    utils.write_file(JAPANESE_TXT, texts)
def main(): RESOURCE_URL = "http://repositorio.dados.gov.br/saude/unidades-saude/unidade-basica-saude/ubs.csv.zip" OUTPUT_PATH = "C:/eclipse/saida.zip" EXTRACTED_PATH = "C:/eclipse/" if len(sys.argv) > 1: RESOURCE_URL = sys.argv[1] if len(sys.argv) > 2: OUTPUT_PATH = sys.argv[2] if len(sys.argv) > 3: EXTRACTED_PATH = sys.argv[3] response = request.urlopen(RESOURCE_URL) out_file = io.FileIO(OUTPUT_PATH, mode="w") content_length = response.getheader('Content-Length') if content_length: length = int(content_length) dw.download_length(response, out_file, length) else: dw.download(response, out_file) zfile = zipfile.ZipFile(OUTPUT_PATH) zfile.extractall(EXTRACTED_PATH) filename = [name for name in os.listdir(EXTRACTED_PATH) if '.csv' in name] dt = dw.loadlistfromcsv(EXTRACTED_PATH+filename[0]) for t in dt: print(t) print("Finished") #dic = dw.dicio(dt) columns_index = {'cod_munic': 2, 'cod_cnes': 3, 'nome_estab': 4, 'desc_endereco': 5} index = ('cod_munic', 'cod_cnes') dict = dw.create_index_from(dt, columns_index, index) for t in dict: print("{0} : {1} ".format(t, dict[t])) response.close() out_file.close() print("Finished")
def create_task_for_meituan(): logger = utils.get_logger() logger.info('start create task for meituan.') add_task_url = common_settings.TASK_URL + common_settings.CREATE_TASK_PATH headers = { 'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8', } deadline = datetime.datetime.now() + datetime.timedelta(days=1) deadline = int(time.mktime(deadline.timetuple())) * 1000 # for func in [u'客服/技术支持', u'售前/售后服务', u'网络/在线客服', u'客服经理/主管', u'客户关系/投诉协调人员', u'客服咨询热线/呼叫中心人员', u'vip专员', u'售前/售后技术支持', u'其他客服/技术支持职位']: # add_task_data = { # "callSystemID": common_settings.CALLSYSTEMID, # "source": 'CH_HR', # "traceID": str(uuid.uuid1()), # # "executeParam": json.loads(i.strip()), # "executeParam": json.dumps({"zone": u'石家庄', "keyword": func, "degree": 0, "refreshTime": 1, "page_now": 1}, ensure_ascii=False), # "taskType": common_settings.TASK_TYPE, # "deadline": deadline # } # add_task_result = utils.download(url=add_task_url, is_json=True, headers=headers, method='post', data=add_task_data) for city in [u'石家庄', u'邢台', u'衡水', u'保定', u'沧州']: for function in [ u'行政', u'行政经理/主管/办公室主任', u'行政专员/助理', u'文员/文秘/秘书/助理', u'内勤/后勤/总务', u'前台/总机/接待', u'商务/行政司机', u'其他行政职位', u'客服/技术支持', u'售前/售后服务', u'网络/在线客服', u'客服经理/主管', u'客户关系/投诉协调人员', u'客服咨询热线/呼叫中心人员', u'vip专员', u'售前/售后技术支持', u'其他客服/技术支持职位' ]: add_task_data = { "callSystemID": common_settings.CALLSYSTEMID, "source": 'CH_HR', "traceID": str(uuid.uuid1()), # "executeParam": json.loads(i.strip()), "executeParam": json.dumps( { "zone": city, "keyword": function, "degree": 0, "refreshTime": 1, "page_now": 1 }, ensure_ascii=False), "taskType": common_settings.TASK_TYPE, "deadline": deadline } add_task_result = utils.download(url=add_task_url, is_json=True, headers=headers, method='post', data=add_task_data)
def crawl(path, pid=None): body = download("http://petitions.whitehouse.gov" + path, path.split('/')[2] + ".html") page = etree.parse(StringIO(body), parser) #catch page text whether or not petition is still active #http://stackoverflow.com/questions/5662404/how-can-i-select-an-element-with-multiple-classes-with-xpath text = "\n".join( page.xpath( "//div[contains(concat(' ',@class,' '),' petition-detail')]/p/text()" )) #check if expired if "The petition you are trying to access has expired" in text: return {"status": "expired"} #if raw_date not found, probably a bad link (or change in HTML, so we should be careful) try: raw_date = page.xpath("//div[@class='date']/text()")[0].strip() except: return {"status": "error", "reason": "no date"} created = datetime.strptime(raw_date, "%b %d, %Y").strftime("%Y-%m-%d") signatures = page.xpath("//div[@class='num-block num-block2']/text()") #indiciates possible response if len(signatures) == 0: signatures = page.xpath("//div[@class='num-block']/text()") response = page.xpath( "//div[contains(concat(' ',@class,' '),' petition-response')]") if response: status = "answered" else: return {"status": "error", "reason": "no signatures"} else: status = "active" signatures = int(signatures[0].replace(",", '')) if not pid: #no pid if fewer than 20 signatures try: pid = page.xpath( "//a[@class='load-next no-follow active']/@rel")[0] except: pid = "N/A" return { "id": pid, "status": status, "title": page.xpath("//h1[@class='title']/text()")[0].strip(), "body": text, "issues": page.xpath("//div[@class='issues']/a/text()"), "created": created, "visited": datetime.now().strftime("%Y-%m-%d-%H:%M:%S"), "signature_count": signatures, "url": "http://petitions.whitehouse.gov" + path }
def get_lrc(self, song, try_web=True): lrc_path = self.get_lrc_filepath(song) # user allocation lrc location_lrc = song.get("location_lrc", "") if location_lrc and os.path.exists(location_lrc): return location_lrc # lrc already exist if os.path.exists(lrc_path): if self.vaild_lrc(lrc_path): return lrc_path else: try: os.unlink(lrc_path) except: pass # search in current directory and same name file current_lrc_path = os.path.join(song.get_dir(), song.get_filename() + ".lrc") if os.path.exists(current_lrc_path) and self.vaild_lrc( current_lrc_path): return current_lrc_path # Search in local directory of the file if song.get("uri") != None and song.get_scheme() == "file": local_lrc = os.path.join(song.get_dir(), self.get_lrc_search_str(song)) if os.path.exists(local_lrc): return local_lrc if try_web and is_network_connected(): if song.get("lyric_url", None): ret = utils.download(song.get("lyric_url"), lrc_path) if ret: return lrc_path trust_a = song.get_str("artist") trust_t = song.get_str("title") filename = song.get_filename() if "-" in filename: untrust_a = filename.split("-")[0].strip() untrust_t = filename.split("-")[1].strip() else: untrust_a = song.get_str("artist") untrust_t = song.get_filename() trust_result = self.multiple_engine(song, lrc_path, trust_a, trust_t) if trust_result: return trust_result else: return self.multiple_engine(song, lrc_path, untrust_a, untrust_t) return None
def fetch_vote(vote_id, options): logging.info("\n[%s] Fetching..." % vote_id) vote_chamber, vote_number, vote_congress, vote_session_year = utils.split_vote_id(vote_id) if vote_chamber == "h": url = "http://clerk.house.gov/evs/%s/roll%03d.xml" % (vote_session_year, int(vote_number)) else: session_num = int(vote_session_year) - utils.get_congress_first_year(int(vote_congress)) + 1 url = "http://www.senate.gov/legislative/LIS/roll_call_votes/vote%d%d/vote_%d_%d_%05d.xml" % (int(vote_congress), session_num, int(vote_congress), session_num, int(vote_number)) # fetch vote XML page body = utils.download( url, "%s/votes/%s/%s%s/%s%s.xml" % (vote_congress, vote_session_year, vote_chamber, vote_number, vote_chamber, vote_number), utils.merge(options, {'binary': True}), ) if not body: return {'saved': False, 'ok': False, 'reason': "failed to download"} if options.get("download_only", False): return {'saved': False, 'ok': True, 'reason': "requested download only"} if "This vote was vacated" in body: # Vacated votes: 2011-484, 2012-327, ... # Remove file, since it may previously have existed with data. for f in (output_for_vote(vote_id, "json"), output_for_vote(vote_id, "xml")): if os.path.exists(f): os.unlink(f) return {'saved': False, 'ok': True, 'reason': "vote was vacated"} dom = etree.fromstring(body) vote = { 'vote_id': vote_id, 'chamber': vote_chamber, 'congress': int(vote_congress), 'session': vote_session_year, 'number': int(vote_number), 'updated_at': datetime.datetime.fromtimestamp(time.time()), 'source_url': url, } # do the heavy lifting if vote_chamber == "h": parse_house_vote(dom, vote) elif vote_chamber == "s": parse_senate_vote(dom, vote) # output and return output_vote(vote, options) return {'ok': True, 'saved': True}
def create_task_for_meituan(): logger = utils.get_logger() logger.info('start create task for meituan.') logger = utils.get_logger() logger.info('start create task from mysql.') mysql_pool = PersistentDB( MySQLdb, host=common_settings.MYSQL_HOST, user=common_settings.MYSQL_USER, passwd=common_settings.MYSQL_PASSWD, db=common_settings.MYSQL_DB, port=common_settings.MYSQL_PORT, charset='utf8' ) conn = mysql_pool.connection() cur = conn.cursor() function_number = cur.execute( 'select * from function_entrence where source="ZHAO_PIN_GOU" ' 'and valid=1 and thirdFunctionCode in ' '(262, 265, 261, 257, 256, 252, 253, 250, 254, 370, 372, 371, 369)') functions = cur.fetchall() logger.info('the number of functions is: %s' % (function_number)) if not function_number: return add_task_url = common_settings.TASK_URL + common_settings.CREATE_TASK_PATH headers = {'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8', } deadline = datetime.datetime.now() + datetime.timedelta(days=1) deadline = int(time.mktime(deadline.timetuple())) * 1000 city_dict = { u'石家庄': '7', u'邢台': '11', u'衡水': '17', u'保定': '12', u'沧州': '15', u'扬州': '66', } for city in [u'石家庄', u'邢台', u'衡水', u'保定', u'沧州']: for function in functions: add_task_data = { "callSystemID": "morgan-zhaopingou-resume-1", "source": 'ZHAO_PIN_GOU', "traceID": str(uuid.uuid1()), # "executeParam": json.loads(i.strip()), "executeParam": json.dumps( {"fenleiName": function[4], "pFenLeiName": function[1], "positionName": function[7], "hopeAdressStr": city_dict[city], "fId": int(function[5]), "pFId": int(function[2]), "pId": int(function[8]), "id": int(function[11])}, ensure_ascii=False), "taskType": "RESUME_FETCH", "deadline": deadline } add_task_result = utils.download(url=add_task_url, is_json=True, headers=headers, method='post', data=add_task_data)
def __init__(self, input_img):
    # Download the VGG weights file
    filename = utils.download(VGG_DOWNLOAD_LINK, VGG_FILENAME, EXPECTED_BYTES)
    # Load the weights
    self.vgg_layers = scipy.io.loadmat(filename)["layers"]
    self.input_img = input_img
    # VGG mean-centers its input images, so we first need the mean of the three RGB channels.
    self.mean_pixels = np.array([123.68, 116.779, 103.939]).reshape(
        (1, 1, 1, 3))
def get_feature(self, feature):
    """The unified API for getting specified features"""
    feature_path = os.path.join(self.location, feature + '.pkl')
    feature_present = os.path.exists(feature_path)
    if not feature_present:
        downloaded = download(self.dataset, feature, self.location)
        if not downloaded:
            return None

    # TODO: check MD5 values and etc. to ensure the downloaded dataset's intact
    with open(feature_path, 'rb') as fp:
        try:
            feature_values = load(fp)
        except:
            print "The previously downloaded dataset is compromised, downloading a new copy..."
            downloaded = download(self.dataset, feature, self.location)
            if not downloaded:
                return None
    return feature_values
def integracion(href_comp, name, cuerpo, options):
    url = base_url + '/GxEmule/' + href_comp
    body = utils.download(url, 'comisiones/' + name + '.html',
                          options.get('force', False), options)
    doc = lxml.html.document_fromstring(body)
    rows = doc.xpath("//div[contains(@style,'border:0px solid #006699')]/div"
                     )[0].xpath("//div[contains(@style,'width:750px')]/div")
    divs = rows[0].cssselect('div')

    result = {}
    pre_res = []
    lineas = 1
    top = 0
    start = False
    for div in divs:
        if (lineas == top):
            result[cat] = pre_res
            break
        elif div.text_content().strip() == 'Miembros':
            cat = 'miembros'
            start = True
        elif div.text_content().strip() == u'Secretaría':
            # Add the members collected so far.
            cat = 'miembros'
            result[cat] = pre_res
            pre_res = []
            cat = 'secretaria'
        elif div.text_content().strip() == 'Reuniones':
            # Add the secretariat collected so far.
            cat = 'secretaria'
            result[cat] = pre_res
            cat = 'reuniones'
            pre_res = []
            top = lineas + 2
        elif start:
            # Store the row data.
            data = {
                'text': div.text_content().strip(),
                'tipo': cat,
                'cuerpo': cuerpo,
            }
            pre_res.append(data)
        lineas += 1

    email_exists = doc.xpath("//a[starts-with(@href, 'mailto')]/@href")
    if email_exists:
        email = email_exists[0].split(':', 1)[1]
    else:
        email = 'none'
    data = {
        'correo': email,
        'cuerpo': cuerpo,
    }
    result['email'] = data
    return result
def create_task_from_mysql(use_keyword='0'): logger = utils.get_logger() logger.info('start create task from mysql.') mysql_pool = PersistentDB( MySQLdb, host=common_settings.MYSQL_HOST, user=common_settings.MYSQL_USER, passwd=common_settings.MYSQL_PASSWD, db=common_settings.MYSQL_DOWNLOAD_DB, port=common_settings.MYSQL_PORT, charset='utf8' ) conn = mysql_pool.connection() cur = conn.cursor() city_number = cur.execute('select * from city_entrence where source="REN_CAI" and valid=1') cities = cur.fetchall() function_number = cur.execute('select * from function_entrence where source="REN_CAI" and valid=1') functions = cur.fetchall() logger.info('the number of city and functions is:%s, %s' % (city_number, function_number)) if not city_number or not function_number: return add_task_url = common_settings.TASK_URL +common_settings.CREATE_TASK_PATH headers = {'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',} today = datetime.datetime.today() next_datetime = datetime.datetime(today.year, today.month, today.day, 0, 0, 0) + datetime.timedelta(days=1) deadline = int(time.mktime(time.strptime(next_datetime.strftime('%Y-%m-%d %H:%M:%S'), '%Y-%m-%d %H:%M:%S'))) * 1000 city_result = [] # use_keyword = '0' if datetime.datetime.now().hour < 12 else '1' if cities: city_dict = {i[1]: i for i in cities} for i in city_order: if i in city_dict: city_result.append(city_dict[i]) city_dict.pop(i) city_result = city_result + city_dict.values() for city in city_result: for function in functions: add_task_data = { "callSystemID": 'morgan-rencaia-resume-1', "source": 'REN_CAI', "traceID": str(uuid.uuid1()), # "executeParam": json.loads(i.strip()), # "executeParam": json.dumps({'residence_ids': city[6], 'residence_name': city[1], "executeParam": json.dumps({ 'function_ids3': function[8], 'function_id_name': function[7], 'residence_ids': city[6], 'residence_name': city[1], 'use_keyword': use_keyword}, ensure_ascii=False), "taskType": "RESUME_FETCH", "deadline": deadline, } add_task_result = utils.download(url=add_task_url, is_json=True, headers=headers, method='post', data=add_task_data) logger.info('done.')