Code example #1
def define_words(a_list):
    """
    :param a_list: a list of words
    :return: a list of semicolon-separated information about each word
        - word, type of word, example usage
    """
    a_result = []
    with open("words_wiki_500.txt", "w") as out_file:
        for word in a_list:
            """Download the page from Cambridge,
               find the individual parts with regexps,
               assemble the result,
               and append it to the results.
            """
            clean_word = word.strip()
            logger.debug("word: %s" % clean_word)

            utils.download(get_page(clean_word), get_file_name(clean_word), logger)

            word_type = utils.get_info(
                get_file_name(clean_word),
                'span class="headword">.*?%s.*?span class="pos".*?>(.*?)<' % clean_word,
                logger,
            )
            out_line = "%s\t%s\n" % (clean_word, word_type)
            logger.debug(out_line)
            out_file.write(out_line)
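
Example #1 relies on a project-specific utils module that is not shown here. Below is a minimal sketch of what its download and get_info helpers might look like, assuming a requests-based fetch and a pattern with a single capture group; the names, signatures, and behavior are assumptions for illustration, not the project's actual code.

import re

import requests


def download(url, filename, logger):
    """Fetch a URL and write the response body to a local file (illustrative sketch)."""
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    with open(filename, "w", encoding="utf-8") as out:
        out.write(response.text)
    logger.debug("saved %s to %s", url, filename)


def get_info(filename, pattern, logger):
    """Return the first capture group of the pattern found in the saved page, or None."""
    with open(filename, encoding="utf-8") as page:
        match = re.search(pattern, page.read(), re.DOTALL)
    logger.debug("pattern %r matched: %s", pattern, bool(match))
    return match.group(1) if match else None
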
Code example #2
 def multiple_engine(self, song, lrc_path, artist, title):    
     try:
         ret = False
         result = TTPlayer().request(artist, title)
         if result:
             if config.getboolean("lyrics", "auto_download"):
                 ret = utils.download(result[0][2], lrc_path)
                 if ret and self.vaild_lrc(lrc_path):
                     return lrc_path
                 else:
                     os.unlink(lrc_path)
                     
         duomi_result = DUOMI().request(artist, title)
         if duomi_result:
             if config.getboolean("lyrics", "auto_download"):
                 ret = utils.download(duomi_result[0][2], lrc_path, "gbk")
                 if ret and self.vaild_lrc(lrc_path):
                     return lrc_path
                 else:
                     os.unlink(lrc_path)
                     
         soso_result = SOSO().request(artist, title)
         if soso_result:
             if config.getboolean("lyrics", "auto_download"):
                 ret = utils.download(soso_result[0][2], lrc_path, "gb18030")
                 if ret and self.vaild_lrc(lrc_path):
                     return lrc_path
                 else:
                     os.unlink(lrc_path)
     except:
         return None
Code example #3
def fetch_house_committee_meetings(committees, options):
    # Load any existing meetings file so we can recycle any GUIDs.
    existing_meetings = []
    output_file = output_for("house")
    if os.path.exists(output_file):
        existing_meetings = json.load(open(output_file))

    opts = dict(options)
    opts["binary"] = True
    opts["force"] = True

    meetings = []
    seen_meetings = set()

    # Scrape the committee listing page for a list of committees with scrapable events.
    committee_html = utils.download(
        "http://docs.house.gov/Committee/Committees.aspx", "committee_schedule/house_overview.html", options
    )
    for cmte in re.findall(r'<option value="(....)">', committee_html):
        if cmte not in committees:
            logging.error("Invalid committee code: " + cmte)
            continue

        # Download the feed for this committee.
        html = utils.download(
            "http://docs.house.gov/Committee/RSS.ashx?Code=%s" % cmte, "committee_schedule/house_%s.xml" % cmte, opts
        )

        # The feed is not quite valid XML: the &nbsp; entity is not recognized
        # by the parser, so convert it to a plain space.
        html = html.replace("&nbsp;", " ")
        # print html
        # Parse and loop through the meetings listed in the committee feed.
        dom = lxml.etree.fromstring(html)

        # original start to loop
        for mtg in dom.xpath("channel/item"):

            eventurl = unicode(mtg.xpath("string(link)"))
            event_id = re.search(r"EventID=(\d+)$", eventurl).group(1)
            pubDate = datetime.datetime.fromtimestamp(mktime(parsedate(mtg.xpath("string(pubDate)"))))
            # skip old records of meetings, some of which just give error pages
            if pubDate < (datetime.datetime.now() - datetime.timedelta(days=60)):
                continue

            # Events can appear in multiple committee feeds if it is a joint meeting.
            if event_id in seen_meetings:
                logging.info("Duplicated multi-committee event: " + event_id)
                continue
            seen_meetings.add(event_id)

            # Load the XML from the event page and send it to parse_house_committee_meeting.
            loaded = load_xml_from_page(eventurl, options, existing_meetings, committees, event_id, meetings)
            # Skip the meeting if the page was a bad zipfile.
            if loaded == False:
                continue

    print "[house] Found %i meetings." % len(meetings)
    return meetings
Code example #4
 def extract_content(self):
     classes = self.extract_classes()
     for klass in classes[1:]:  # Exclude ONLINE CLASS
         folder_name = remove_accents(klass['class'])
         create_folder(folder_name)
         print('Extracting Class: {0}'.format(klass['class']))
         self.browser.get('https://unipac-bomdespacho.blackboard.com{0}'.format(klass['href']))
         self.browser.find_element_by_id('header::0-whatsNewView::CO').click()  # Open content list
         block_class_contents = self.browser.find_element_by_id('block::0-whatsNewView::CO')
         class_contents = block_class_contents.find_elements_by_css_selector(
             "a[onclick*='nautilus_utils.actionSelected']"
         )
         for i_content in range(len(class_contents)):
             try:
                 block_classes_contents = self.browser.find_element_by_id('block::0-whatsNewView::CO')
                 class_contents = block_classes_contents.find_elements_by_css_selector(
                     "a[onclick*='nautilus_utils.actionSelected']"
                 )
                 class_contents[i_content].click()
                 self.check_visibility(By.CLASS_NAME, "individualContent-link")
                 file_link = self.browser.find_element_by_class_name('individualContent-link').get_attribute('href')
                 cookies = self.browser.get_cookies()
                 download(cookies, file_link, folder_name)
                 self.browser.back()
                 self.check_visibility(By.ID, "block::0-whatsNewView::CO")
             except TimeoutException:
                 print("Error in: {0} - {1}".format(klass['class'], klass['href']))
Code example #5
File: populate_db.py Project: techiev2/testGisc
def populate_events(ap_args):
    """Main function to populate the database with archive events.

    :Parameters:
        - ap_args : dict : Information related to archive(s).
    """
    FLOG.debug(ap_args)
    if __name__ != '__main__':
        ap_args = check_if_params(ap_args)
    check_arg_for_none_value(ap_args)
    CLOG.info('DB Populate args :- ' + str(ap_args))

    arch_path = ap_args.get('temp')
    del_arch_path = ap_args.get('delete_temp')
    if not arch_path:
        arch_path = gisc_msgs.TEMP_LOCATION
    arg = [k for k in ap_args if k != 'temp'][0]

    if arg == 'url':
        empty_directory(arch_path)
        download(ap_args[arg], arch_path)
    elif arg == 'arch_date':
        empty_directory(arch_path)
        download(get_url_from_date(ap_args[arg]), arch_path)
    elif arg == 'src_dir':
        arch_path = ap_args[arg]
    elif arg == 'zip':
        extract_zip(ap_args[arg], arch_path)
    elif arg == 'files':
        empty_directory(arch_path)
        for arch_file in ap_args[arg]:
            handle_valid_invalid_files(arch_file, arch_path)
    populate(arch_path)
    if arg != 'src_dir' and del_arch_path:
        empty_directory(arch_path, False)
Code example #6
def main():
    with tf.variable_scope('input') as scope:
        # use variable instead of placeholder because we're training the intial image to make it
        # look like both the content image and the style image
        input_image = tf.Variable(np.zeros([1, IMAGE_HEIGHT, IMAGE_WIDTH, 3]), dtype=tf.float32)
    
    utils.download(VGG_DOWNLOAD_LINK, VGG_MODEL, EXPECTED_BYTES)
    model = vgg_model.load_vgg(VGG_MODEL, input_image)
    model['global_step'] = tf.Variable(0, dtype=tf.int32, trainable=False, name='global_step')
    
    content_image = utils.get_resized_image(CONTENT_IMAGE, IMAGE_HEIGHT, IMAGE_WIDTH)
    content_image = content_image - MEAN_PIXELS
    style_image = utils.get_resized_image(STYLE_IMAGE, IMAGE_HEIGHT, IMAGE_WIDTH)
    style_image = style_image - MEAN_PIXELS

    model['content_loss'], model['style_loss'], model['total_loss'] = _create_losses(model, 
                                                    input_image, content_image, style_image)
    ###############################
    ## TO DO: create optimizer
    model['optimizer'] = tf.train.AdamOptimizer(LR).minimize(model['total_loss'], 
                                                            global_step=model['global_step'])
    ###############################
    model['summary_op'] = _create_summary(model)

    initial_image = utils.generate_noise_image(content_image, IMAGE_HEIGHT, IMAGE_WIDTH, NOISE_RATIO)
    train(model, input_image, initial_image)
Code example #7
File: actions.py Project: zuloo/gists
def get(gist_id, requested_file, destination_dir, facade):
    """ Download a gist file.

    Gists can have several files. This method searches for and downloads
    a single file from a gist.
    If 'requested_file' is not given, the call succeeds without an error
    only when the gist has just a single file.

    :param gist_id: identifier of the gist to download
    :param requested_file: name of the Gist file to download
    :param destination_dir: destination directory after the download
    :param facade: instance of the object that actually perform the request
    """

    # Get the gist information
    response = facade.request_gist(gist_id)

    if response.ok:
        # Gist file found. Parse it into a 'model.Gist' class.
        gist_obj = model.Gist(get_json(response))
        list_names = [gistfile.filename for gistfile in gist_obj.files]

        if len(gist_obj.files) == 1 and not requested_file:
            # Download the only file in the gist
            gistfile = gist_obj.files[0]
            download(gistfile.raw_url, destination_dir,
                     gistfile.filename, gistfile.size)

            result = build_result(True, literals.DOWNLOAD_OK,
                                  gistfile.filename)
        else:
            # The gist has more than one file and no filename was specified. Error.
            if not requested_file:
                list_names = ", ".join(list_names)
                result = build_result(False, literals.DOWNLOAD_MORE_FILES,
                                      list_names)
            else:
                # Search for the Gist file
                gistfile = gist_obj.getFile(requested_file)
                if gistfile:

                    # Gist file found. Download it.
                    download(gistfile.raw_url, destination_dir,
                             gistfile.filename, gistfile.size)

                    result = build_result(True, literals.DOWNLOAD_OK,
                                          gistfile.filename)
                else:
                    # Requested file not found in Gist
                    list_of_names = ", ".join(list_names)
                    result = build_result(False, literals.FILE_NOT_FOUND,
                                          list_of_names)

    else:
        # Handle GitHub response error
        result = build_result(False, literals.DOWNLOAD_ERROR,
                              get_json(response)['message'])

    return result
Code example #8
File: tidy.py Project: mattorb/Ale
    def install(self, args=None):
        download('http://www.lacusveris.com/PythonTidy/PythonTidy-1.16.python', 'pythontidy.py')
        mkdir(finalTidyDir)
        shutil.move(join(alePath('tmp'), 'pythontidy.py'), finalTidyPath)
        os.system('chmod +x %s' % finalTidyPath)

        logging.info('Patching tidy to wrap at 120 columns instead of 80 ...')
        os.system('patch %s %s' % (finalTidyPath, join(alePath('recipes_all/tidy/'), 'tidy80col.patch')))
Code example #9
def main(url=ZIP_CODE_URL):
    path = utils.DATASET_HOME + ADDRESS_ZIP
    utils.download(url, path)
    files = utils.unzip(path)
    if files and len(files) > 0:
        write_address(files[0])
    else:
        print("Failed to download or unzip the file. Please check {0}.".format(utils.DATASET_HOME))
Code example #10
File: load_vgg.py Project: chenluyuan97/zhihu
 def __init__(self, input_img):
     # Download the weights file
     utils.download(VGG_DOWNLOAD_LINK, VGG_FILENAME, EXPECTED_BYTES)
     # Load the file
     self.vgg_layers = scipy.io.loadmat(VGG_FILENAME)["layers"]
     self.input_img = input_img
     # VGG mean-centers images during preprocessing, so we first need the mean of the three RGB channels
     self.mean_pixels = np.array([123.68, 116.779, 103.939]).reshape((1, 1, 1, 3))
Code example #11
File: manage.py Project: martinseener/seedBank
 def _download(self, src, dst_path):
     """download a file"""
     src_file = os.path.basename(src)
     dst = os.path.join(dst_path, src_file)
     if os.path.isfile(dst):
         logging.info('"%s" already exists, download skipped', dst)
     else:
         utils.make_dirs(dst_path)
         utils.download(src, dst)
Code example #12
def download_report(report):
  report_path = "%s/%s/%s/report.%s" % (report['inspector'], report['year'], report['report_id'], report['file_type'])
  binary = (report['file_type'] == 'pdf')

  utils.download(
    report['url'],
    "%s/%s" % (utils.data_dir(), report_path),
    {'binary': binary}
  )
  return report_path
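
For context, here is a hypothetical report record in the shape download_report reads; all field values below are invented, and only the keys mirror the ones used above.

report = {
    'inspector': 'dod',
    'year': '2015',
    'report_id': 'audit-123',
    'file_type': 'pdf',
    'url': 'https://example.gov/reports/audit-123.pdf',
}
# report_path would be "dod/2015/audit-123/report.pdf", saved under utils.data_dir().
report_path = download_report(report)
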
Code example #13
 def do_snapshot_download(self, args):
     '''Download a SNAPSHOT'''
     snapshot = args.snapshot
     body = self.client.snapshot_download(snapshot=snapshot)
     result = utils.loads(body)
     result.pop()
     if len(result) == 0:
         print('Snapshot %s does not exist.' % snapshot)
         return -1
     uri = result.pop()
     utils.download(uri)
Code example #14
    def multiple_engine(self, song, lrc_path, artist, title):    
        try:
            ret = False
            
            ting_result = poster.query_lrc_info(artist, title)
            if ting_result:
                urls = [item[2] for item in ting_result]
                for url in urls:
                    ret = utils.download(url, lrc_path)
                    if ret:
                        return lrc_path
            
            result = TTPlayer().request(artist, title)

            if result:
                urls = [item[2] for item in result]                
                for url in urls:
                    ret = utils.download(url, lrc_path)
                    if ret and self.vaild_lrc(lrc_path):
                        return lrc_path
                    
            ttpod_result = TTPod().request_data(artist, title)        
            if ttpod_result:
                with open(lrc_path, 'wb') as fp:
                    fp.write(ttpod_result)
                    return lrc_path
                        
            duomi_result = DUOMI().request(artist, title)
            if duomi_result:
                urls = [item[2] for item in duomi_result]                
                for url in urls:
                    ret = utils.download(url, lrc_path, "gbk")
                    if ret and self.vaild_lrc(lrc_path):
                        return lrc_path
                        
            soso_result = SOSO().request(artist, title)
            if soso_result:
                urls = [item[2] for item in soso_result]                
                for url in urls:
                    ret = utils.download(url, lrc_path, "gb18030")
                    if ret and self.vaild_lrc(lrc_path):
                        return lrc_path
            try:    
                os.unlink(lrc_path)
            except:    
                pass
                
            return None
                    
        except Exception, e:
            print e
            return None
Code example #15
File: source.py Project: rhs2132/conda
def download_to_cache(meta):
    if not isdir(SRC_CACHE):
        os.makedirs(SRC_CACHE)

    fn = meta['fn']
    md5 = meta.get('md5')
    path = join(SRC_CACHE, fn)
    if not isfile(path):
        download(meta['url'], path, md5)

    if md5 and not md5_file(path) == md5:
        raise Exception("MD5 mismatch: %r" % meta)
    return path
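
A hypothetical meta dict in the shape download_to_cache expects (values invented for illustration):

meta = {
    'fn': 'example-1.0.tar.gz',
    'url': 'https://example.org/dist/example-1.0.tar.gz',
    'md5': '0123456789abcdef0123456789abcdef',
}
# Downloads into SRC_CACHE only if the file is not already there,
# then verifies the MD5 checksum and returns the local path.
path = download_to_cache(meta)
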
Code example #16
File: manage.py Project: martinseener/seedBank
 def iso(self, name):
     """download ISOs"""
     dst = os.path.join(self.cfg['paths']['isos'], name + '.iso')
     if os.path.isfile(dst):
         logging.info('nothing to do, "%s" already exists', dst)
         return
     distribution = name.split('-', 1)[0]
     if distribution == 'ubuntu':
         url = self.iso_ubuntu(name)
     elif distribution == 'debian':
         url = self.iso_debian(name)
     utils.make_dirs(self.cfg['paths']['isos'])
     utils.download(url, dst)
Code example #17
def vote_ids_for_house(congress, session_year, options):
    vote_ids = []

    index_page = "http://clerk.house.gov/evs/%s/index.asp" % session_year
    group_page = r"ROLL_(\d+)\.asp"
    link_pattern = r"http://clerk.house.gov/cgi-bin/vote.asp\?year=%s&rollnumber=(\d+)" % session_year

    # download index page, find the matching links to the paged listing of votes
    page = utils.download(
        index_page,
        "%s/votes/%s/pages/house.html" % (congress, session_year),
        options)

    if not page:
        logging.error("Couldn't download House vote index page, aborting")
        return None

    # extract matching links
    doc = html.document_fromstring(page)
    links = doc.xpath(
        "//a[re:match(@href, '%s')]" % group_page,
        namespaces={"re": "http://exslt.org/regular-expressions"})

    for link in links:
        # get some identifier for this inside page for caching
        grp = re.match(group_page, link.get("href")).group(1)

        # download inside page, find the matching links
        page = utils.download(
            urlparse.urljoin(index_page, link.get("href")),
            "%s/votes/%s/pages/house_%s.html" % (congress, session_year, grp),
            options)

        if not page:
            logging.error("Couldn't download House vote group page (%s), aborting" % grp)
            continue

        doc = html.document_fromstring(page)
        votelinks = doc.xpath(
            "//a[re:match(@href, '%s')]" % link_pattern,
            namespaces={"re": "http://exslt.org/regular-expressions"})

        for votelink in votelinks:
            num = re.match(link_pattern, votelink.get("href")).group(1)
            vote_id = "h" + num + "-" + str(congress) + "." + session_year
            if not should_process(vote_id, options):
                continue
            vote_ids.append(vote_id)

    return utils.uniq(vote_ids)
Code example #18
File: liftover.py Project: sp00nman/hubward
def download_chainfile(source_assembly, target_assembly):
    """
    Download if needed, putting in the cache_dir.

    If the environment variable HUBWARD_CACHE_DIR is not set, then use
    ~/.hubward_cache.
    """
    cache_dir = os.environ.get(
        'HUBWARD_CACHE_DIR', os.path.expanduser('~/.hubward_cache'))
    utils.makedirs(cache_dir)
    url = chainfile_url(source_assembly, target_assembly)
    dest = os.path.join(cache_dir, os.path.basename(url))
    utils.download(url, dest)
    return dest
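
A possible call, assuming UCSC-style assembly names (illustrative values):

# Downloads the hg19 -> hg38 chain file on the first call and reuses the cached copy afterwards.
chain_path = download_chainfile('hg19', 'hg38')
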
Code example #19
File: bill_versions.py Project: GPHemsley/congress
def fetch_version(bill_version_id, options):
  # Download MODS etc.
	
  logging.info("\n[%s] Fetching..." % bill_version_id)
  
  bill_type, number, congress, version_code = utils.split_bill_version_id(bill_version_id)
  # bill_id = "%s%s-%s" % (bill_type, number, congress)

  utils.download(
    mods_url_for(bill_version_id), 
    document_filename_for(bill_version_id, "mods.xml"),
    utils.merge(options, {'binary': True, 'to_cache': False})
  )
  
  return write_bill_version_metadata(bill_version_id)
Code example #20
File: code.py Project: internetarchive/openlibrary
    def POST(self, category):
        i = web.input('olid', author=None, file={}, source_url=None, success_url=None, failure_url=None)

        success_url = i.success_url or web.ctx.get('HTTP_REFERRER') or '/'
        failure_url = i.failure_url or web.ctx.get('HTTP_REFERRER') or '/'

        def error(code__msg):
            (code, msg) = code__msg
            print("ERROR: upload failed, ", i.olid, code, repr(msg), file=web.debug)
            _cleanup()
            url = changequery(failure_url, errcode=code, errmsg=msg)
            raise web.seeother(url)

        if i.source_url:
            try:
                data = download(i.source_url)
            except:
                error(ERROR_INVALID_URL)
            source_url = i.source_url
        elif i.file is not None and i.file != {}:
            data = i.file.value
            source_url = None
        else:
            error(ERROR_EMPTY)

        if not data:
            error(ERROR_EMPTY)

        try:
            save_image(data, category=category, olid=i.olid, author=i.author, source_url=i.source_url, ip=web.ctx.ip)
        except ValueError:
            error(ERROR_BAD_IMAGE)

        _cleanup()
        raise web.seeother(success_url)
Code example #21
def go_to_list_html(param, cookie, viewstate, retry):
    global proxy
    logger.info('搜索前置页面-开始初始搜索------ %s' % retry)
    if retry <= 0:
        return None
    url = 'http://ehire.51job.com/Candidate/SearchResumeNew.aspx'
    # param = {"function_code": "0107", "functionName": "软件工程师", "region_code": "010000",
    #          "regionName": "北京"}
    data = arouse_utils.get_frist_post_headers(viewstate, param=param)

    logger.info(proxy)
    result = utils.download(url=url,
                            data=data,
                            proxy=proxy,
                            cookie=cookie,
                            method='post')
    if result['code'] != 0:
        logger.error("连接页面异常 ,重试: retry= %s" % retry)
        proxy = utils.get_proxy()
        return go_to_list_html(param, cookie, viewstate, retry - 1)
    elif '用户数不够' in result['data'] or len(result['data']) < 1000:
        logger.error("代理异常,重试: retry= %s" % retry)
        proxy = utils.get_proxy()
        return go_to_list_html(param, cookie, viewstate, retry - 1)
    if '您的操作过于频繁,请注意劳逸结合' in result['data']:
        return None
    return result['data']
Code example #22
def go_to_search_html(cookie, retry):
    global proxy
    if retry <= 0:
        return None
    logger.info('跳转搜索前置页面中------%s ' % retry)
    url = 'http://ehire.51job.com/Candidate/SearchResumeIndexNew.aspx'
    headers = arouse_utils.get_get_headers(
        'http://ehire.51job.com/Navigate.aspx')
    if not proxy:
        proxy = utils.get_proxy()
    logger.info(proxy)
    utils_download = utils.download(url=url,
                                    headers=headers,
                                    proxy=proxy,
                                    cookie=cookie)

    if utils_download['code'] != 0:
        logger.error('搜索页面出错:%s %s' % (url, retry))
        if utils_download.get(
                'data'
        ) and '<a href="/MainLogin.aspx?returl=' in utils_download['data']:
            return 'login'
        proxy = utils.get_proxy()
        return go_to_search_html(cookie, retry - 1)

    if '<a href="/MainLogin.aspx?returl=' in utils_download['data']:
        return 'login'
    viewstate = arouse_utils.find(
        '<input type="hidden" name="__VIEWSTATE" id="__VIEWSTATE" value="(.*?)" />',
        utils_download['data'])
    if not viewstate:
        proxy = utils.get_proxy()
        return go_to_search_html(cookie, retry - 1)
    return viewstate
Code example #23
File: process.py Project: logonmy/Spider-1
def download_page(url=None,
                  method=None,
                  header=None,
                  proxy=None,
                  data=None,
                  session=None,
                  need_session=False):
    logger = utils.get_logger()
    if not proxy:
        proxy = local_proxy()
    for x in xrange(0, 5):
        result = utils.download(url=url,
                                headers=header,
                                data=data,
                                method=method,
                                need_session=need_session,
                                session=session,
                                proxy=proxy,
                                allow_redirects=True,
                                retry_time=1)
        if result['code'] == 0:
            logger.info('success when download %s ' % url)
            break
        else:
            proxy = local_proxy()
            result['code'] = 500
        time.sleep(1)
    result['proxy'] = proxy
    return result
Code example #24
  def candidate_for(bioguide):
    url = current_bioguide[bioguide]["terms"][-1].get("url", None)
    if not url:
      if debug:
        print "[%s] No official website, skipping" % bioguide
      return None

    if debug:
      print "[%s] Downloading..." % bioguide
    cache = "congress/%s.html" % bioguide
    body = utils.download(url, cache, force)

    all_matches = []
    for regex in regexes[service]:
      matches = re.findall(regex, body, re.I)
      if matches:
        all_matches.extend(matches)

    if all_matches:
      for candidate in all_matches:
        passed = True
        for blacked in blacklist[service]:
          if re.search(blacked, candidate, re.I):
            passed = False
        
        if not passed:
          if debug:
            print "\tBlacklisted: %s" % candidate
          continue

        return candidate
      return None
Code example #25
File: nomination_info.py Project: TTREN/congress
def fetch_nomination(nomination_id, options={}):
    logging.info("\n[%s] Fetching..." % nomination_id)

    # fetch committee name map, if it doesn't already exist
    nomination_type, number, congress = utils.split_nomination_id(nomination_id)
    if not number:
        return {'saved': False, 'ok': False, 'reason': "Couldn't parse %s" % nomination_id}

    if not utils.committee_names:
        utils.fetch_committee_names(congress, options)

    # fetch bill details body
    body = utils.download(
        nomination_url_for(nomination_id),
        nomination_cache_for(nomination_id, "information.html"), options)

    if not body:
        return {'saved': False, 'ok': False, 'reason': "failed to download"}

    if options.get("download_only", False):
        return {'saved': False, 'ok': True, 'reason': "requested download only"}

    # TODO:
    # - detect group nominations, particularly for military promotions
    # - detect when a group nomination is split into sub nominations because of divergent Senate action

    nomination = parse_nomination(nomination_id, body, options)
    output_nomination(nomination, options)
    return {'ok': True, 'saved': True}
Code example #26
def next_html(account_cookies, data, retry):
    logger.info('开始进行下一页 %s %s' % (
        account_cookies.get('userName', ''),
        retry,
    ))
    global proxy
    if retry <= 0:
        return None
    cookie = account_cookies.get('cookie')
    headers = arouse_utils.get_get_headers()
    url = 'http://ehire.51job.com/Candidate/SearchResumeNew.aspx'
    logger.info(proxy)

    result = utils.download(url=url,
                            data=data,
                            proxy=proxy,
                            cookie=cookie,
                            headers=headers,
                            method='post')

    if result['code'] != 0:
        logger.error("连接页面异常 ,重试: retry= %s" % retry)
        proxy = utils.get_proxy()
        return next_html(account_cookies, data, retry - 1)
    elif '用户数不够' in result['data'] or len(result['data']) < 1000:
        logger.error("代理异常,重试: retry= %s" % retry)
        proxy = utils.get_proxy()
        return next_html(account_cookies, data, retry - 1)
    if '您的操作过于频繁,请注意劳逸结合' in result['data']:
        return None
    return result['data']
Code example #27
File: fdsys.py Project: hugovk/congress
def mirror_package(sitemap, package_name, lastmod, content_detail_url, options):
    """Create a local mirror of a FDSys package."""

    # Return a list of files we downloaded.
    results = []

    if not options.get("granules", False):
        # Most packages are just a package. This is the usual case.
        results = mirror_package_or_granule(sitemap, package_name, None, lastmod, options)

    else:
        # In some collections, like STATUTE, each document has subparts which are not
        # described in the sitemap. Load the main HTML page and scrape for the sub-files.
        # In the STATUTE collection, the MODS information in granules is redundant with
        # information in the top-level package MODS file. But the only way to get granule-
        # level PDFs is to go through the granules.
        content_index = utils.download(content_detail_url,
                                       "fdsys/package/%s/%s/%s.html" % (sitemap["year"], sitemap["collection"], package_name),
                                       utils.merge(options, {
                                           'binary': True,
                                       }))
        if not content_index:
            raise Exception("Failed to download %s" % content_detail_url)
        for link in html.fromstring(content_index).cssselect("table.page-details-data-table td.rightLinkCell a"):
            if link.text == "More":
                m = re.match("granule/(.*)/(.*)/content-detail.html", link.get("href"))
                if not m or m.group(1) != package_name:
                    raise Exception("Unmatched granule URL %s" % link.get("href"))
                granule_name = m.group(2)
                results = mirror_package_or_granule(sitemap, package_name, granule_name, lastmod, options)

    return results
Code example #28
	def get_matching_pages():
		# Does a Wikipedia API search for pages containing either of the
		# two templates. Returns the pages.

		page_titles = set()

		for template in ("CongLinks", "CongBio"):
			eicontinue = ""
			while True:
				# construct query URL, using the "eicontinue" of the last query to get the next batch
				url = 'http://en.wikipedia.org/w/api.php?action=query&list=embeddedin&eititle=Template:%s&eilimit=500&format=xml' % template
				if eicontinue: url += "&eicontinue=" + eicontinue

				# load the XML
				print("Getting %s pages (%d...)" % (template, len(page_titles)))
				dom = lxml.etree.fromstring(utils.download(url, None, True)) # can't cache eicontinue probably

				for pgname in dom.xpath("query/embeddedin/ei/@title"):
					page_titles.add(pgname)

				# get the next eicontinue value and loop
				eicontinue = dom.xpath("string(query-continue/embeddedin/@eicontinue)")
				if not eicontinue: break

		return page_titles
Code example #29
File: repo.py Project: Ippo343/pykan
def fetch(repo):
    """
    Downloads a repository given the url of a zip and returns the contained modules.
    Automatically excludes modules that require an unsupported spec_version.

    The result is a dictionary of dictionaries with this structure:
    { identifier: { version: metadata } }
    """

    logging.info("Fetching repo {}".format(repo["name"]))

    archive = ZipFile(StringIO(utils.download(repo["url"])))

    result = dict()
    for entry in [name for name in archive.namelist() if name.endswith(".ckan")]:
    
        metadata = json.loads(archive.read(entry))
        id, ver = metadata["identifier"], metadata["version"]

        if not metadata["spec_version"] in pykan.supported_spec_versions:
            logging.debug("Ignoring {} {} because of unsupported spec_version {}"
                          .format(id, ver, metadata["spec_version"]))
            continue

        # result[identifier] = { version: metadata } 
        if id not in result:
            result[id] = {ver: metadata}
        else:
            result[id][ver] = metadata

    return result
Code example #30
def fetch_bioguide_page(bioguide, force):
  url = "http://bioguide.congress.gov/scripts/biodisplay.pl?index=%s" % bioguide
  cache = "legislators/bioguide/%s.html" % bioguide
  try:
    body = download(url, cache, force)

    # Fix a problem?
    body = body.replace("&Aacute;\xc2\x81", "&Aacute;")

    # Entities like &#146; are in Windows-1252 encoding. Normally lxml
    # handles that for us, but we're also parsing HTML. The lxml.html.HTMLParser
    # doesn't support specifying an encoding, and the lxml.etree.HTMLParser doesn't
    # provide a cssselect method on element objects. So we'll just decode ourselves.
    body = utils.unescape(body, "Windows-1252")

    dom = lxml.html.parse(io.StringIO(body)).getroot()
  except lxml.etree.XMLSyntaxError:
    raise Exception("Error parsing: " + url)

  # Sanity check.

  if len(dom.cssselect("title")) == 0:
    raise Exception("No page for bioguide %s!" % bioguide)

  return dom
Code example #31
File: code.py Project: internetarchive/openlibrary
    def POST(self, category):
        i = web.input(olid=None, author=None, data=None, source_url=None, ip=None, _unicode=False)

        web.ctx.pop("_fieldstorage", None)
        web.ctx.pop("_data", None)

        def error(code__msg):
            (code, msg) = code__msg
            _cleanup()
            e = web.badrequest()
            e.data = simplejson.dumps({"code": code, "message": msg})
            raise e

        source_url = i.source_url
        data = i.data

        if source_url:
            try:
                data = download(source_url)
            except:
                error(ERROR_INVALID_URL)

        if not data:
            error(ERROR_EMPTY)

        try:
            d = save_image(data, category=category, olid=i.olid, author=i.author, source_url=i.source_url, ip=i.ip)
        except ValueError:
            error(ERROR_BAD_IMAGE)

        _cleanup()
        return simplejson.dumps({"ok": "true", "id": d.id})
Code example #32
def run(options):
    # Download the TSV file.
    cache_zip_path = "adler-wilkerson-bills.zip"
    utils.download(
        "http://congressionalbills.org/billfiles/bills80-92.zip",
        cache_zip_path,
        utils.merge(options, {'binary': True, 'needs_content': False}))

    # Unzip in memory and process the records.
    zfile = zipfile.ZipFile(utils.cache_dir() + "/" + cache_zip_path)
    csvreader = csv.DictReader(zfile.open("bills80-92.txt"), delimiter="\t")
    for record in csvreader:
        rec = process_bill(record)

        import pprint
        pprint.pprint(rec)
Code example #33
File: injector.py Project: GlearDev/nuxeo
 def get_input(self):
     args = self.arguments
     if args.download:
         self.log.info("downloading")
         archive_name, url = self.downloadInfo()
         return open(download(args.data_dir, archive_name, url), 'r')
     return args.input
Code example #34
def vote_ids_for_senate(congress, session_year, options):
    session_num = int(session_year) - utils.get_congress_first_year(int(congress)) + 1

    vote_ids = []

    page = utils.download(
        "http://www.senate.gov/legislative/LIS/roll_call_lists/vote_menu_%s_%d.xml" % (congress, session_num),
        "%s/votes/%s/pages/senate.xml" % (congress, session_year),
        utils.merge(options, {'binary': True})
    )

    if not page:
        logging.error("Couldn't download Senate vote XML index, aborting")
        return None

    dom = etree.fromstring(page)

    # Sanity checks.
    if int(congress) != int(dom.xpath("congress")[0].text):
        logging.error("Senate vote XML returns the wrong Congress: %s" % dom.xpath("congress")[0].text)
        return None
    if int(session_year) != int(dom.xpath("congress_year")[0].text):
        logging.error("Senate vote XML returns the wrong session: %s" % dom.xpath("congress_year")[0].text)
        return None

    # Get vote list.
    for vote in dom.xpath("//vote"):
        num = int(vote.xpath("vote_number")[0].text)
        vote_id = "s" + str(num) + "-" + str(congress) + "." + session_year
        if not should_process(vote_id, options):
            continue
        vote_ids.append(vote_id)
    return vote_ids
Code example #35
File: create_task.py Project: logonmy/Spider-1
def create_task_from_mysql(use_keyword='0'):
    logger = utils.get_logger()
    logger.info('start create task from mysql.')
    mysql_pool = PersistentDB(
        MySQLdb,
        host=common_settings.MYSQL_HOST,
        user=common_settings.MYSQL_USER,
        passwd=common_settings.MYSQL_PASSWD,
        db='spider',
        port=common_settings.MYSQL_PORT,
        charset='utf8'
    )
    conn = mysql_pool.connection()
    cur = conn.cursor()
    # city_number = cur.execute('select code from city_entrence where source="ZHAO_PIN_GOU" and valid=1')
    # cities = cur.fetchall()
    function_number = cur.execute('select * from function_entrence where source="ZHAO_PIN_GOU" and valid=1')
    functions = cur.fetchall()
    # logger.info('the number of city and functions is:%s, %s' % (city_number, function_number))
    # if not city_number or not function_number:
    #     return

    logger.info('the number of cities and functions is: %s, %s' % (len(city_order), function_number))
    if not function_number:
        return

    add_task_url = common_settings.TASK_URL + common_settings.CREATE_TASK_PATH
    headers = {'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8', }
    today = datetime.datetime.today()
    next_datetime = datetime.datetime(today.year, today.month, today.day, 0, 0, 0) + datetime.timedelta(days=1)
    deadline = int(time.mktime(time.strptime(next_datetime.strftime('%Y-%m-%d %H:%M:%S'), '%Y-%m-%d %H:%M:%S'))) * 1000
    # use_keyword = '0' if datetime.datetime.now().hour<12 else '1'

    # city_result = []
    # if cities:
    #     cities = [i[0] for i in cities]
    #     for i in city_order:
    #         if i in cities:
    #             city_result.append(i)
    #             cities.remove(i)
    #     city_result = city_result + cities
    random.shuffle(city_order)
    for city in city_order:
        for function in functions:
            add_task_data = {
                "callSystemID": "morgan-zhaopingou-resume-1",
                "source": 'ZHAO_PIN_GOU',
                "traceID": str(uuid.uuid1()),
                # "executeParam": json.loads(i.strip()), 
                "executeParam": json.dumps(
                    {"fenleiName": function[4], "pFenLeiName": function[1], "positionName": function[7],
                     "hopeAdressStr": city, "fId": int(function[5]), "pFId": int(function[2]), "pId": int(function[8]),
                     "id": int(function[11]), 'use_keyword': use_keyword}, ensure_ascii=False),
                "taskType": "RESUME_FETCH",
                "deadline": deadline
            }
            add_task_result = utils.download(url=add_task_url, is_json=True, headers=headers, method='post',
                                             data=add_task_data)
    logger.info('done.')
Code example #36
def fetch_version(bill_version_id, options):
    # Download MODS etc.

    logging.info("\n[%s] Fetching..." % bill_version_id)

    bill_type, number, congress, version_code = utils.split_bill_version_id(
        bill_version_id)
    # bill_id = "%s%s-%s" % (bill_type, number, congress)

    utils.download(mods_url_for(bill_version_id),
                   document_filename_for(bill_version_id, "mods.xml"),
                   utils.merge(options, {
                       'binary': True,
                       'to_cache': False
                   }))

    return write_bill_version_metadata(bill_version_id)
Code example #37
 def start_mission(self):
     if self.artist_name:
         query_result = multi_query_artist_engine(self.artist_name)
         if query_result:
             if utils.download(query_result,
                               get_tmp_save_path(self.artist_name)):
                 cleanup_cover(get_tmp_save_path(self.artist_name),
                               get_cover_save_path(self.artist_name))
Code example #38
def download_file(url, dt_referencia, file_name):
    # check whether the file should be downloaded
    if not utils.check_download(dt_referencia, file_name):
        return False
    dt_referencia = dt_referencia.strftime('%d/%m/%Y')
    params = {
        'Titulo_1': 'quadro-resumo',
        'Consulta_1': 'Ambos',
        'Dt_Ref': dt_referencia,
        'DataIni': dt_referencia,
        'DataFim': dt_referencia,
        'Indice': 'quadro-resumo',
        'Consulta': 'Ambos',
        'saida': 'csv',
        'Idioma': 'PT'
    }
    utils.download(url, params, file_name)
Code example #39
def transformer_download(to_path=DEFAULT_TO_PATH,
                         replace_existing=False) -> str:
    transformer_path = os.path.abspath(
        os.path.join(to_path, "distilbert-base-nli-mean-tokens"))
    if os.path.exists(transformer_path) and not replace_existing:
        print(f"already downloaded! {transformer_path}")
        return transformer_path
    transformer_zip = os.path.join(to_path,
                                   "distilbert-base-nli-mean-tokens.zip")
    download(
        "https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/v0.2/distilbert-base-nli-mean-tokens.zip",
        transformer_zip,
    )
    with ZipFile(transformer_zip, "r") as z:
        z.extractall(transformer_path)
    os.remove(transformer_zip)
    return transformer_path
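
Typical calls might look like the following (paths are illustrative):

# Download into the default location, skipping the work if the model folder already exists.
model_dir = transformer_download()

# Re-download into a custom directory, overwriting any existing copy.
model_dir = transformer_download(to_path='/tmp/models', replace_existing=True)
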
Code example #40
File: questao1.py Project: fabiomsrs/ml_class
def baixar_titulos():
    html = download('http://www.tce.pi.gov.br/')
    soup = bs(html, 'html.parser')

    news_div = soup.find('div', id='latestnews')
    lista_news = news_div.find_all('li', class_='latestnews')

    return lista_news
Code example #41
def scrape(options):
    hoy = datetime.datetime.now().strftime('%d%m%Y')
    fecha = options.get('fecha', hoy)
    integracion = options.get('integracion', 'D')
    tipoleg = options.get('tipoleg', 'Tit')
    orden = options.get('orden', 'Legislador')
    grafico = options.get('grafico', 's')

    query = "?Fecha=%s&Cuerpo=%s&Integracion=%s&TipoLeg=%s&Orden=%s&Grafico=%s" % (
        fecha, cuerpo, integracion, tipoleg, orden, grafico)
    url_to_scrape = "http://www.parlamento.gub.uy/GxEmule/IntcpoGrafico.asp%s" % query

    logging.info(
        "Scrapeando informacion de diputados desde pagina del parlamento. \nURL: %s."
        % url_to_scrape)

    body = utils.download(url_to_scrape,
                          'legisladores/camara_%s_%s.html' % (cuerpo, hoy),
                          options.get('force', False), options)
    doc = lxml.html.document_fromstring(body)

    tablas = doc.xpath("//table")
    rows = tablas[3].cssselect('tr td')

    diputados = []
    for row in rows:
        mail_base = row.xpath("a[starts-with(@href, 'mailto')]/@href")
        if mail_base:
            email = mail_base[0].split(':', 1)[1]
        else:
            email = ''

        congress_people = {
            'nombre':
            format_word(
                row.xpath('br/following-sibling::text()')[0].split(',')[1]),
            'apellido':
            format_word(
                row.xpath('br/following-sibling::text()')[0].split(',')[0]),
            'partido':
            format_word(row.xpath('br/following-sibling::text()')[1]),
            'email':
            email,
            'foto':
            base_url + row.xpath('img/@src')[0],
            'departamento':
            format_word(row.xpath('br/following-sibling::text()')[2])
        }
        diputados.append(congress_people)

    output_path = "data/diputados.json"

    utils.write(
        json.dumps(diputados,
                   sort_keys=True,
                   indent=2,
                   default=utils.format_datetime,
                   encoding="utf-8"), output_path)
Code example #42
    def __init__(self,):
        sha1sum_id = "72cb19612318bb304d4a169804f525f88dc3f0d0"
        dataset = "petfinder"
        file_name = f"{dataset}_for_unit_tests.zip"
        url = get_repo_url() + file_name
        save_path = os.path.join(get_data_home_dir(), file_name)
        self._path = os.path.join(get_data_home_dir(), dataset)
        download(
            url=url,
            path=save_path,
            sha1_hash=sha1sum_id,
        )
        protected_zip_extraction(
            save_path,
            sha1_hash=sha1sum_id,
            folder=self._path,
        )
        self._train_df = pd.read_csv(os.path.join(self._path, 'train.csv'), index_col=0)
        self._test_df = pd.read_csv(os.path.join(self._path, 'test.csv'), index_col=0)
        for img_col in self.image_columns:
            self._train_df[img_col] = self._train_df[img_col].apply(
                lambda ele: path_expander(ele, base_folder=os.path.join(self._path, "images")))
            self._test_df[img_col] =\
                self._test_df[img_col].apply(
                    lambda ele: path_expander(ele, base_folder=os.path.join(self._path, "images")))
            print(self._train_df[img_col][0])
            print(self._test_df[img_col][0])

        _, self._train_df = train_test_split(
            self._train_df,
            test_size=0.1,
            random_state=np.random.RandomState(123),
            stratify=self._train_df[self.label_columns[0]],
        )
        _, self._test_df = train_test_split(
            self._test_df,
            test_size=0.1,
            random_state=np.random.RandomState(123),
            stratify=self._test_df[self.label_columns[0]],
        )
        self._train_df.reset_index(drop=True, inplace=True)
        self._test_df.reset_index(drop=True, inplace=True)

        print(f"train sample num: {len(self._train_df)}")
        print(f"test sample num: {len(self._test_df)}")
Code example #43
def prepare_ffmpeg(out_dir=None, version=None):
    ffmpeg_version = "4.1.3"
    output_dir = os.getcwd() + "/test/"
    
    ffmpeg_win32_dev_url = "https://ffmpeg.zeranoe.com/builds/win32/dev/ffmpeg-" + ffmpeg_version + "-win32-dev.zip"
    ffmpeg_win64_dev_url = "https://ffmpeg.zeranoe.com/builds/win64/dev/ffmpeg-" + ffmpeg_version + "-win64-dev.zip"
    ffmpeg_win32_shared_url = "https://ffmpeg.zeranoe.com/builds/win32/shared/ffmpeg-" + ffmpeg_version + "-win32-shared.zip"
    ffmpeg_win64_shared_url = "https://ffmpeg.zeranoe.com/builds/win64/shared/ffmpeg-" + ffmpeg_version + "-win64-shared.zip"

    pre_downloads = [
        ffmpeg_win32_dev_url,
        ffmpeg_win64_dev_url,
        ffmpeg_win32_shared_url,
        ffmpeg_win64_shared_url
    ]

    wanted_downloads = {}
    for d in pre_downloads:
        wanted_downloads[d] = d[d.rfind('/')+1:]

    for url, filename in wanted_downloads.items():
        if not Path(output_dir + filename).is_file() or os.stat(output_dir + filename).st_size == 0:
            print("Downloading " + url + " ...")
            download(url, output_dir + filename)
            print("Extract " + filename + " ...")
            ZipFile(output_dir + filename, 'r').extractall(output_dir)

    # Copy include headers
    shutil.copytree(output_dir + get_dirname_from_link(ffmpeg_win32_dev_url) + "/include", output_dir + "/include/ffmpeg")
    # Copy libraries
    shutil.copytree(output_dir + get_dirname_from_link(ffmpeg_win32_dev_url) + "/lib", output_dir + "/lib/ffmpeg/win32")
    shutil.copytree(output_dir + get_dirname_from_link(ffmpeg_win64_dev_url) + "/lib", output_dir + "/lib/ffmpeg/win64")
    # Copy dlls
    shutil.copytree(output_dir + get_dirname_from_link(ffmpeg_win32_shared_url) + "/bin", output_dir + "/bin/win32")
    shutil.copytree(output_dir + get_dirname_from_link(ffmpeg_win64_shared_url) + "/bin", output_dir + "/bin/win64")

    unnecessary_zip_files = []
    unnecessary_dirs = []
    for url, zip_file in wanted_downloads.items():
        unnecessary_zip_files.append(output_dir + zip_file)
        unnecessary_dirs.append(
            output_dir + zip_file[:zip_file.rfind('.')])

    remove_unnecessary_files_or_dirs(
            unnecessary_zip_files, unnecessary_dirs)
Code example #44
File: fdsys.py Project: rs19hack/congress
def get_sitemap(year, collection, lastmod, options):
  """Gets a single sitemap, downloading it if the sitemap has changed.
  
  Downloads the root sitemap (year==None, collection==None), or
  the sitemap for a year (collection==None), or the sitemap for
  a particular year and collection. Pass lastmod which is the current
  modification time of the file according to its parent sitemap, which
  is how it knows to return a cached copy.
  
  Returns the sitemap parsed into a DOM.
  """
  
  # Construct the URL and the path to where to cache the file on disk.
  if year == None:
    url = "http://www.gpo.gov/smap/fdsys/sitemap.xml"
    path = "fdsys/sitemap/sitemap.xml"
  elif collection == None:
    url = "http://www.gpo.gov/smap/fdsys/sitemap_%s/sitemap_%s.xml" % (year, year)
    path = "fdsys/sitemap/%s/sitemap.xml" % year
  else:
    url = "http://www.gpo.gov/smap/fdsys/sitemap_%s/%s_%s_sitemap.xml" % (year, year, collection)
    path = "fdsys/sitemap/%s/%s.xml" % (year, collection)
    
  # Should we re-download the file?
  lastmod_cache_file = utils.cache_dir() + "/" + path.replace(".xml", "-lastmod.txt")
  if options.get("cached", False):
    # If --cached is used, don't hit the network.
    force = False
  elif not lastmod:
    # No *current* lastmod date is known for this file (because it is the master
    # sitemap file, probably), so always download.
    force = True
  else:
    # If the file is out of date or --force is used, download the file.
    cache_lastmod = utils.read(lastmod_cache_file)
    force = (lastmod != cache_lastmod) or options.get("force", False)
    
  if force:
    logging.warn("Downloading: %s" % url)
    
  body = utils.download(url, path, utils.merge(options, {
    'force': force, 
    'binary': True
  }))
  
  if not body:
      raise Exception("Failed to download %s" % url)
      
  # Write the current last modified date to disk so we know the next time whether
  # we need to fetch the file.
  if lastmod and not options.get("cached", False):
    utils.write(lastmod, lastmod_cache_file)
  
  try:
    return etree.fromstring(body)
  except etree.XMLSyntaxError as e:
    raise Exception("XML syntax error in %s: %s" % (url, str(e)))
Code example #45
 def search(self, q):
     results = {"movies": [], "people": []}
     q = urlparse.quote(q.encode('utf8'))
     url = self.baseURL + "/search/?q=" + q + "&p=.htm"
     soup = utils.download(url)
     soup = search.trim_search_page(soup)
     results["movies"] = search.movie_search(q, soup)
     results["people"] = search.people_search(soup)
     return results
Code example #46
File: fdsys.py Project: favila/congress
def mirror_file(year, collection, package_name, lastmod, granule_name,
                file_types, options):
    # Where should we store the file?
    path = get_output_path(year, collection, package_name, granule_name,
                           options)
    if not path: return  # should skip

    # Do we need to update this record?
    lastmod_cache_file = path + "/lastmod.txt"
    cache_lastmod = utils.read(lastmod_cache_file)
    force = ((lastmod != cache_lastmod) or options.get(
        "force", False)) and not options.get("cached", False)

    # Try downloading files for each file type.
    targets = get_package_files(package_name, granule_name, path)
    for file_type in file_types:
        if file_type not in targets:
            raise Exception("Invalid file type: %s" % file_type)
        f_url, f_path = targets[file_type]

        if (not force) and os.path.exists(f_path):
            continue  # we already have the current file
        logging.warn("Downloading: " + f_path)
        data = utils.download(
            f_url, f_path,
            utils.merge(options, {
                'xml': True,
                'force': force,
                'to_cache': False,
                'needs_content': file_type == "text" and f_path.endswith(".html"),
            }))

        if not data:
            if file_type == "pdf":
                # expected to be present for all packages
                raise Exception("Failed to download %s" % package_name)
            else:
                # not all packages have all file types, but assume this is OK
                logging.error("file not found: " + f_url)

        if file_type == "text" and f_path.endswith(".html"):
            # The "text" format files are put in an HTML container. Unwrap it into a .txt file.
            # TODO: Encoding? The HTTP content-type header says UTF-8, but do we trust it?
            #       html.fromstring does auto-detection.
            with open(f_path[0:-4] + "txt", "w") as f:
                text_content = unicode(html.fromstring(data).text_content())
                f.write(text_content.encode("utf8"))

    # Write the current last modified date to disk so we know the next time whether
    # we need to fetch the files for this sitemap item.
    if lastmod and not options.get("cached", False):
        utils.write(lastmod, lastmod_cache_file)
Code example #47
    def process_dent(self, nwo, ext, library_candidates) -> Tuple[List[Dict[str, Any]], List[Tuple[str, str]]]:
        # Process dependents (applications) to get function calls
        dents = []
        edges = []
        _, nwo = remap_nwo(nwo)
        if nwo is None:
            return dents, edges

        tmp_dir = download(nwo)
        files = walk(tmp_dir, ext)
        sha = None

        for f in files:
            context_and_calls = self.get_context_and_function_calls(f)
            if context_and_calls is None:
                continue
            if sha is None:
                sha = get_sha(tmp_dir, nwo)

            nwo, path, context, calls = context_and_calls
            libraries = []
            for cxt in context:
                if type(cxt) == dict:
                    libraries.extend([v.split('.')[0] for v in cxt.values()])
                elif type(cxt) == list:
                    libraries.extend(cxt)

            match_scopes = {}
            for cxt in set(libraries):
                if cxt in library_candidates:
                    match_scopes[cxt] = library_candidates[cxt]

            for call in calls:
                for depended_library_name, dependend_library_functions in match_scopes.items():
                    for depended_library_function in dependend_library_functions:
                        # Other potential filters: len(call['identifier']) > 6 or len(call['identifier'].split('_')) > 1
                        if (call['identifier'] not in self.language_parser.STOPWORDS and
                            ((depended_library_function['identifier'].split('.')[-1] == '__init__' and
                              call['identifier'] == depended_library_function['identifier'].split('.')[0]) or
                             ((len(call['identifier']) > 9 or
                               (not call['identifier'].startswith('_') and len(call['identifier'].split('_')) > 1)) and
                              call['identifier'] == depended_library_function['identifier'])
                            )):
                            dent = {
                                'nwo': nwo,
                                'sha': sha,
                                'path': path,
                                'language': self.language,
                                'identifier': call['identifier'],
                                'argument_list': call['argument_list'],
                                'url': 'https://github.com/{}/blob/{}/{}#L{}-L{}'.format(nwo, sha, path,
                                                                                         call['start_point'][0] + 1,
                                                                                         call['end_point'][0] + 1)
                            }
                            dents.append(dent)
                            edges.append((dent['url'], depended_library_function['url']))
        return dents, edges
Code example #48
File: ibtracs.py Project: Neo-101/R2S
    def download(self, basin):
        """Download IBTrACS data.

        """
        self.logger.info('Downloading IBTrACS')
        utils.setup_signal_handler()
        utils.set_format_custom_text(
            self.CONFIG['ibtracs']['data_name_length'])

        # url = self.CONFIG['ibtracs']['urls']['since1980']
        url = self.CONFIG['ibtracs']['urls'][basin]
        file = url.split('/')[-1]
        file = file[:-3].replace('.', '_') + '.nc'
        dir = self.CONFIG['ibtracs']['dirs']
        os.makedirs(dir, exist_ok=True)
        self.ibtracs_file_path = f'{dir}{file}'

        utils.download(url, self.ibtracs_file_path, progress=True)
コード例 #49
0
def main(url=JAPANESE_URL):
    content = utils.download(url)
    soup = BeautifulSoup(content)
    text_nodes = soup.find_all(name=["p", "td", "li"], text=True)
    texts = []
    for t in [tn.text for tn in text_nodes]:
        texts.append([t])

    utils.write_file(JAPANESE_TXT, texts)
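One small note: recent versions of BeautifulSoup warn when no parser is named. If that warning matters, the constructor call can be made explicit (a minor, assumed adjustment, not part of the original snippet):

    soup = BeautifulSoup(content, "html.parser")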
コード例 #50
0
def main():
    RESOURCE_URL = "http://repositorio.dados.gov.br/saude/unidades-saude/unidade-basica-saude/ubs.csv.zip"
    OUTPUT_PATH = "C:/eclipse/saida.zip"
    EXTRACTED_PATH = "C:/eclipse/" 
    if len(sys.argv) > 1:
        RESOURCE_URL = sys.argv[1]
    if len(sys.argv) > 2:
        OUTPUT_PATH = sys.argv[2]
    if len(sys.argv) > 3:
        EXTRACTED_PATH = sys.argv[3]
    response = request.urlopen(RESOURCE_URL)
    out_file = io.FileIO(OUTPUT_PATH, mode="w")
    content_length = response.getheader('Content-Length')
    if content_length:
        length = int(content_length)
        dw.download_length(response, out_file, length)
    else:
        dw.download(response, out_file)
    zfile = zipfile.ZipFile(OUTPUT_PATH)
    zfile.extractall(EXTRACTED_PATH)
    
    filename = [name for name in os.listdir(EXTRACTED_PATH) if '.csv' in name]
    
    
    dt = dw.loadlistfromcsv(EXTRACTED_PATH+filename[0])
    
    for t in dt:
        print(t) 
    
    print("Finished")
    
    #dic = dw.dicio(dt)
    
    columns_index = {'cod_munic': 2, 'cod_cnes': 3, 'nome_estab': 4, 'desc_endereco': 5}
    index = ('cod_munic', 'cod_cnes')
    
    index_by_key = dw.create_index_from(dt, columns_index, index)

    for key in index_by_key:
        print("{0} : {1} ".format(key, index_by_key[key]))
    
    response.close()
    out_file.close()
    print("Finished")
コード例 #51
0
def create_task_for_meituan():
    logger = utils.get_logger()
    logger.info('start create task for meituan.')
    add_task_url = common_settings.TASK_URL + common_settings.CREATE_TASK_PATH
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',
    }
    deadline = datetime.datetime.now() + datetime.timedelta(days=1)
    deadline = int(time.mktime(deadline.timetuple())) * 1000
    # for func in [u'客服/技术支持', u'售前/售后服务', u'网络/在线客服', u'客服经理/主管', u'客户关系/投诉协调人员', u'客服咨询热线/呼叫中心人员', u'vip专员', u'售前/售后技术支持', u'其他客服/技术支持职位']:
    #     add_task_data = {
    #         "callSystemID": common_settings.CALLSYSTEMID,
    #         "source": 'CH_HR',
    #         "traceID": str(uuid.uuid1()),
    #         # "executeParam": json.loads(i.strip()),
    #         "executeParam": json.dumps({"zone": u'石家庄', "keyword": func, "degree": 0, "refreshTime": 1, "page_now": 1}, ensure_ascii=False),
    #         "taskType": common_settings.TASK_TYPE,
    #         "deadline": deadline
    #     }
    #     add_task_result = utils.download(url=add_task_url, is_json=True, headers=headers, method='post', data=add_task_data)

    for city in [u'石家庄', u'邢台', u'衡水', u'保定', u'沧州']:
        for function in [
                u'行政', u'行政经理/主管/办公室主任', u'行政专员/助理', u'文员/文秘/秘书/助理',
                u'内勤/后勤/总务', u'前台/总机/接待', u'商务/行政司机', u'其他行政职位', u'客服/技术支持',
                u'售前/售后服务', u'网络/在线客服', u'客服经理/主管', u'客户关系/投诉协调人员',
                u'客服咨询热线/呼叫中心人员', u'vip专员', u'售前/售后技术支持', u'其他客服/技术支持职位'
        ]:
            add_task_data = {
                "callSystemID": common_settings.CALLSYSTEMID,
                "source": 'CH_HR',
                "traceID": str(uuid.uuid1()),
                # "executeParam": json.loads(i.strip()),
                "executeParam": json.dumps(
                    {
                        "zone": city,
                        "keyword": function,
                        "degree": 0,
                        "refreshTime": 1,
                        "page_now": 1
                    },
                    ensure_ascii=False),
                "taskType": common_settings.TASK_TYPE,
                "deadline": deadline
            }
            add_task_result = utils.download(url=add_task_url,
                                             is_json=True,
                                             headers=headers,
                                             method='post',
                                             data=add_task_data)
コード例 #52
0
def crawl(path, pid=None):
    body = download("http://petitions.whitehouse.gov" + path,
                    path.split('/')[2] + ".html")
    page = etree.parse(StringIO(body), parser)
    #catch page text whether or not petition is still active
    #http://stackoverflow.com/questions/5662404/how-can-i-select-an-element-with-multiple-classes-with-xpath
    text = "\n".join(
        page.xpath(
            "//div[contains(concat(' ',@class,' '),' petition-detail')]/p/text()"
        ))

    #check if expired
    if "The petition you are trying to access has expired" in text:
        return {"status": "expired"}

    #if raw_date not found, probably a bad link (or change in HTML, so we should be careful)
    try:
        raw_date = page.xpath("//div[@class='date']/text()")[0].strip()
    except:
        return {"status": "error", "reason": "no date"}

    created = datetime.strptime(raw_date, "%b %d, %Y").strftime("%Y-%m-%d")
    signatures = page.xpath("//div[@class='num-block num-block2']/text()")

    #indicates possible response
    if len(signatures) == 0:
        signatures = page.xpath("//div[@class='num-block']/text()")
        response = page.xpath(
            "//div[contains(concat(' ',@class,' '),' petition-response')]")
        if response:
            status = "answered"
        else:
            return {"status": "error", "reason": "no signatures"}
    else:
        status = "active"
    signatures = int(signatures[0].replace(",", ''))

    if not pid:
        #no pid if fewer than 20 signatures
        try:
            pid = page.xpath(
                "//a[@class='load-next no-follow active']/@rel")[0]
        except:
            pid = "N/A"

    return {
        "id": pid,
        "status": status,
        "title": page.xpath("//h1[@class='title']/text()")[0].strip(),
        "body": text,
        "issues": page.xpath("//div[@class='issues']/a/text()"),
        "created": created,
        "visited": datetime.now().strftime("%Y-%m-%d-%H:%M:%S"),
        "signature_count": signatures,
        "url": "http://petitions.whitehouse.gov" + path
    }
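For context, a hypothetical invocation might look like the following; the path and id here are purely illustrative and not taken from the source:

# petition = crawl("/petition/example-petition-slug/aBcDeFg")
# petition["status"]  # one of: "active", "answered", "expired", "error"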
コード例 #53
0
    def get_lrc(self, song, try_web=True):

        lrc_path = self.get_lrc_filepath(song)

        # user allocation lrc
        location_lrc = song.get("location_lrc", "")
        if location_lrc and os.path.exists(location_lrc):
            return location_lrc

        # lrc already exist
        if os.path.exists(lrc_path):
            if self.vaild_lrc(lrc_path):
                return lrc_path
            else:
                try:
                    os.unlink(lrc_path)
                except:
                    pass

        # search in current directory and same name file
        current_lrc_path = os.path.join(song.get_dir(),
                                        song.get_filename() + ".lrc")
        if os.path.exists(current_lrc_path) and self.vaild_lrc(
                current_lrc_path):
            return current_lrc_path

        # Search in local directory of the file
        if song.get("uri") != None and song.get_scheme() == "file":
            local_lrc = os.path.join(song.get_dir(),
                                     self.get_lrc_search_str(song))
            if os.path.exists(local_lrc):
                return local_lrc

        if try_web and is_network_connected():
            if song.get("lyric_url", None):
                ret = utils.download(song.get("lyric_url"), lrc_path)
                if ret:
                    return lrc_path

            trust_a = song.get_str("artist")
            trust_t = song.get_str("title")
            filename = song.get_filename()
            if "-" in filename:
                untrust_a = filename.split("-")[0].strip()
                untrust_t = filename.split("-")[1].strip()
            else:
                untrust_a = song.get_str("artist")
                untrust_t = song.get_filename()
            trust_result = self.multiple_engine(song, lrc_path, trust_a,
                                                trust_t)
            if trust_result:
                return trust_result
            else:
                return self.multiple_engine(song, lrc_path, untrust_a,
                                            untrust_t)
        return None
コード例 #54
0
def fetch_vote(vote_id, options):
  logging.info("\n[%s] Fetching..." % vote_id)
  
  vote_chamber, vote_number, vote_congress, vote_session_year = utils.split_vote_id(vote_id)
  
  if vote_chamber == "h":
    url = "http://clerk.house.gov/evs/%s/roll%03d.xml" % (vote_session_year, int(vote_number))
  else:
    session_num = int(vote_session_year) - utils.get_congress_first_year(int(vote_congress)) + 1
    url = "http://www.senate.gov/legislative/LIS/roll_call_votes/vote%d%d/vote_%d_%d_%05d.xml" % (int(vote_congress), session_num, int(vote_congress), session_num, int(vote_number))
  
  # fetch vote XML page
  body = utils.download(
    url, 
    "%s/votes/%s/%s%s/%s%s.xml" % (vote_congress, vote_session_year, vote_chamber, vote_number, vote_chamber, vote_number),
    utils.merge(options, {'binary': True}),
    )

  if not body:
    return {'saved': False, 'ok': False, 'reason': "failed to download"}

  if options.get("download_only", False):
    return {'saved': False, 'ok': True, 'reason': "requested download only"}

  if "This vote was vacated" in body:
    # Vacated votes: 2011-484, 2012-327, ...
    # Remove file, since it may previously have existed with data.
    for f in (output_for_vote(vote_id, "json"), output_for_vote(vote_id, "xml")):
      if os.path.exists(f):
        os.unlink(f)
    return {'saved': False, 'ok': True, 'reason': "vote was vacated"}

  dom = etree.fromstring(body)

  vote = {
    'vote_id': vote_id,
    'chamber': vote_chamber,
    'congress': int(vote_congress),
    'session': vote_session_year,
    'number': int(vote_number),
    'updated_at': datetime.datetime.fromtimestamp(time.time()),
    'source_url': url,
  }
  
  # do the heavy lifting
  
  if vote_chamber == "h":
    parse_house_vote(dom, vote)
  elif vote_chamber == "s":
    parse_senate_vote(dom, vote)
    
  # output and return
  
  output_vote(vote, options)

  return {'ok': True, 'saved': True}
コード例 #55
0
ファイル: create_task.py プロジェクト: logonmy/Spider-1
def create_task_for_meituan():
    logger = utils.get_logger()
    logger.info('start create task for meituan.')

    mysql_pool = PersistentDB(
        MySQLdb,
        host=common_settings.MYSQL_HOST,
        user=common_settings.MYSQL_USER,
        passwd=common_settings.MYSQL_PASSWD,
        db=common_settings.MYSQL_DB,
        port=common_settings.MYSQL_PORT,
        charset='utf8'
    )
    conn = mysql_pool.connection()
    cur = conn.cursor()
    function_number = cur.execute(
        'select * from function_entrence where source="ZHAO_PIN_GOU" '
        'and valid=1 and thirdFunctionCode in '
        '(262, 265, 261, 257, 256, 252, 253, 250, 254, 370, 372, 371, 369)')
    functions = cur.fetchall()
    logger.info('the number of functions is: %s' % (function_number))
    if not function_number:
        return

    add_task_url = common_settings.TASK_URL + common_settings.CREATE_TASK_PATH
    headers = {'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8', }
    deadline = datetime.datetime.now() + datetime.timedelta(days=1)
    deadline = int(time.mktime(deadline.timetuple())) * 1000

    city_dict = {
        u'石家庄': '7',
        u'邢台': '11',
        u'衡水': '17',
        u'保定': '12',
        u'沧州': '15',
        u'扬州': '66',
    }

    for city in [u'石家庄', u'邢台', u'衡水', u'保定', u'沧州']:
        for function in functions:
            add_task_data = {
                "callSystemID": "morgan-zhaopingou-resume-1",
                "source": 'ZHAO_PIN_GOU',
                "traceID": str(uuid.uuid1()),
                # "executeParam": json.loads(i.strip()), 
                "executeParam": json.dumps(
                    {"fenleiName": function[4], "pFenLeiName": function[1], "positionName": function[7],
                     "hopeAdressStr": city_dict[city], "fId": int(function[5]), "pFId": int(function[2]),
                     "pId": int(function[8]), "id": int(function[11])}, ensure_ascii=False),
                "taskType": "RESUME_FETCH",
                "deadline": deadline
            }
            add_task_result = utils.download(url=add_task_url, is_json=True, headers=headers, method='post',
                                             data=add_task_data)
コード例 #56
0
    def __init__(self, input_img):
        # Download the pre-trained VGG weights file.
        filename = utils.download(VGG_DOWNLOAD_LINK, VGG_FILENAME,
                                  EXPECTED_BYTES)
        # Load the weights from the .mat file.
        self.vgg_layers = scipy.io.loadmat(filename)["layers"]
        self.input_img = input_img
        # VGG mean-centers its input images, so we first need the mean of each
        # of the three RGB channels.
        self.mean_pixels = np.array([123.68, 116.779, 103.939]).reshape(
            (1, 1, 1, 3))
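        # Hedged note (not in the original excerpt): these per-channel means
        # are typically subtracted from the input before it is run through the
        # VGG layers, e.g. `centered = self.input_img - self.mean_pixels`.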
コード例 #57
0
def run(options):
    # Download the TSV file.
    cache_zip_path = "adler-wilkerson-bills.zip"
    utils.download(
        "http://congressionalbills.org/billfiles/bills80-92.zip",
        cache_zip_path,
        utils.merge(options, {
            'binary': True,
            'needs_content': False
        }))

    # Unzip in memory and process the records.
    zfile = zipfile.ZipFile(utils.cache_dir() + "/" + cache_zip_path)
    csvreader = csv.DictReader(zfile.open("bills80-92.txt"), delimiter="\t")
    for record in csvreader:
        rec = process_bill(record)

        import pprint
        pprint.pprint(rec)
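Note that `zfile.open()` returns a binary file object, so the CSV step above is written for Python 2. A hedged sketch of a Python 3-friendly variant, assuming the archive member is UTF-8-encoded tab-separated text:

import csv
import io
import zipfile

def read_bill_records(zip_path, member="bills80-92.txt"):
    """Hypothetical Python 3 variant of the zip-and-CSV step above."""
    with zipfile.ZipFile(zip_path) as zfile:
        # Wrap the binary member so csv.DictReader receives text.
        with io.TextIOWrapper(zfile.open(member), encoding="utf-8") as text_file:
            for record in csv.DictReader(text_file, delimiter="\t"):
                yield record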
コード例 #58
0
    def get_feature(self, feature):
        """The unified API for getting specified features"""
        feature_path = os.path.join(self.location, feature + '.pkl')
        feature_present = os.path.exists(feature_path)
        if not feature_present:
            downloaded = download(self.dataset, feature, self.location)
            if not downloaded:
                return None

        # TODO: check MD5 values and etc. to ensure the downloaded dataset's intact
        with open(feature_path, 'rb') as fp:
            try:
                feature_values = load(fp)
            except:
                print "The previously downloaded dataset is compromised, downloading a new copy..."
                dowloaded = download(self.dataset, feature, self.location)
                if not downloaded:
                    return None
        return feature_values
コード例 #59
0
def integracion(href_comp, name, cuerpo, options):
    url = base_url + '/GxEmule/' + href_comp

    body = utils.download(url, 'comisiones/' + name + '.html',
                          options.get('force', False), options)
    doc = lxml.html.document_fromstring(body)
    rows = doc.xpath("//div[contains(@style,'border:0px solid #006699')]/div"
                     )[0].xpath("//div[contains(@style,'width:750px')]/div")
    divs = rows[0].cssselect('div')

    result = {}
    pre_res = []
    lineas = 1
    top = 0
    start = False
    for div in divs:
        if (lineas == top):
            result[cat] = pre_res
            break
        elif div.text_content().strip() == 'Miembros':
            cat = 'miembros'
            start = True
        elif div.text_content().strip() == u'Secretaría':
            # add the members
            cat = 'miembros'
            result[cat] = pre_res
            pre_res = []
            cat = 'secretaria'
        elif div.text_content().strip() == 'Reuniones':
            # add the secretariat
            cat = 'secretaria'
            result[cat] = pre_res
            cat = 'reuniones'
            pre_res = []
            top = lineas + 2
        elif start:
            #store data
            data = {
                'text': div.text_content().strip(),
                'tipo': cat,
                'cuerpo': cuerpo,
            }
            pre_res.append(data)
        lineas += 1
        email_exists = doc.xpath("//a[starts-with(@href, 'mailto')]/@href")
        if email_exists:
            email = email_exists[0].split(':', 1)[1]
        else:
            email = 'none'
        data = {
            'correo': email,
            'cuerpo': cuerpo,
        }
        result['email'] = data
    return result
コード例 #60
0
def create_task_from_mysql(use_keyword='0'):
    logger = utils.get_logger()
    logger.info('start create task from mysql.')
    mysql_pool = PersistentDB(
        MySQLdb, 
        host=common_settings.MYSQL_HOST, 
        user=common_settings.MYSQL_USER,
        passwd=common_settings.MYSQL_PASSWD, 
        db=common_settings.MYSQL_DOWNLOAD_DB,
        port=common_settings.MYSQL_PORT, 
        charset='utf8'
    )
    conn = mysql_pool.connection()
    cur = conn.cursor()
    city_number = cur.execute('select * from city_entrence where source="REN_CAI" and valid=1')
    cities = cur.fetchall()
    function_number = cur.execute('select * from function_entrence where source="REN_CAI" and valid=1')
    functions = cur.fetchall()
    logger.info('the number of city and functions is:%s, %s' % (city_number, function_number))
    if not city_number or not function_number:
        return
    add_task_url = common_settings.TASK_URL + common_settings.CREATE_TASK_PATH
    headers = {'Content-Type': 'application/x-www-form-urlencoded;charset=UTF-8',}
    
    today = datetime.datetime.today()
    next_datetime = datetime.datetime(today.year, today.month, today.day, 0, 0, 0) + datetime.timedelta(days=1)
    deadline = int(time.mktime(time.strptime(next_datetime.strftime('%Y-%m-%d %H:%M:%S'), '%Y-%m-%d %H:%M:%S'))) * 1000

    city_result = []
    # use_keyword = '0' if datetime.datetime.now().hour < 12 else '1'
    
    if cities:
        city_dict = {i[1]: i for i in cities}
        for i in city_order:
            if i in city_dict:
                city_result.append(city_dict[i])
                city_dict.pop(i)
        city_result = city_result + city_dict.values()

    for city in city_result:
        for function in functions:
            add_task_data = {
                "callSystemID": 'morgan-rencaia-resume-1', 
                "source": 'REN_CAI', 
                "traceID": str(uuid.uuid1()), 
                # "executeParam": json.loads(i.strip()), 
                # "executeParam": json.dumps({'residence_ids': city[6], 'residence_name': city[1], 
                "executeParam": json.dumps({ 'function_ids3': function[8], 'function_id_name': function[7], 'residence_ids': city[6], 'residence_name': city[1], 
                'use_keyword': use_keyword}, ensure_ascii=False), 
                "taskType": "RESUME_FETCH",
                "deadline": deadline,
            }
            add_task_result = utils.download(url=add_task_url, is_json=True, headers=headers, method='post', data=add_task_data)
            
    logger.info('done.')