def retry(url, requestor=None, n_tries=2, sleep=None, good_codes=(200, ),
          reraise=True, always_sleep=True):
    """Fetch ``url``, retrying up to ``n_tries`` times on errors or bad status.

    Args:
        url: target URL.
        requestor: optional callable taking ``url``; when not given, the
            module-level ``request()`` helper is used.
        n_tries: total number of attempts (not extra retries).
        sleep: delay value passed to ``slp()``; ``None`` disables sleeping.
        good_codes: status codes treated as success.
        reraise: after the final failed attempt, re-raise the last exception;
            otherwise return ``None``.
        always_sleep: when True, sleep after every request; when False, sleep
            only between failed attempts.

    Returns:
        The response whose ``status_code`` is in ``good_codes``, or ``None``
        when every attempt failed and ``reraise`` is False.

    Raises:
        The last exception (including ``BadStatusCode``) when all attempts
        fail and ``reraise`` is True.
    """
    # Bind the request strategy once, outside the retry loop.
    # is_() presumably tests "is not None" — TODO confirm against its definition.
    do_request = (lambda: requestor(url)) if is_(requestor) \
        else (lambda: request(url))
    for i in range(n_tries):
        try:
            r = do_request()
            if always_sleep and is_(sleep):
                slp(sleep)
            if r.status_code in good_codes:
                return r
            # Raising here routes "bad status" through the same retry path
            # as transport-level errors.
            raise BadStatusCode(r.status_code)
        # Was a bare `except:` — narrowed so KeyboardInterrupt/SystemExit
        # are no longer swallowed by the retry loop.
        except Exception:
            if i >= n_tries - 1:
                if reraise:
                    raise
                else:
                    return
            # i + 2 is the 1-based number of the *next* attempt.
            prefix = '\nBAD REQUEST, RETRY (%s/%s)' % (i + 2, n_tries)
            log_web_err(url, kind='warning', prefix=prefix)
            if not always_sleep and is_(sleep):
                slp(sleep)
def request(url, kind='get', sleep=None, **kw):
    """Issue an HTTP request through the ``requests`` module.

    Args:
        url: target URL.
        kind: name of the ``requests`` method to use ('get', 'post', ...).
        sleep: optional delay handed to ``slp()`` after the request.
        **kw: extra keyword arguments forwarded to the requests call.

    Returns:
        The ``requests`` response object.
    """
    http_method = getattr(requests, kind)
    response = http_method(url, headers=get_ua(), proxies=get_proxy(), **kw)
    if is_(sleep):
        slp(sleep)
    return response
def get_imgs(query, lang=None, debug=None):
    """Run an image query and return the images parsed from the result page.

    Args:
        query: search term handed to ``QUERY_URL``.
        lang: optional language; when set, ``lang_params`` are appended
            to the query URL.
        debug: when truthy, print the final query URL.

    Returns:
        Whatever ``get_imgs_from_soup`` extracts from the parsed page.
    """
    url = QUERY_URL(query)
    if is_(lang):
        url = url + '&' + lang_params(lang)
    if debug:
        print('query_url for %s: %s' % (query, url))
    page = request(url, sleep=(.5, 1.5))
    parsed = BeautifulSoup(page.text, 'lxml')
    return get_imgs_from_soup(parsed)
def get_proxy(p=None, rotate=True):
    """Build a requests-style proxies mapping.

    Args:
        p: explicit proxy address; used verbatim when given.
        rotate: with no explicit address, pick ``current_proxy()`` when True,
            otherwise a random entry from ``PROXIES``.

    Returns:
        A dict with 'http' and 'https' proxy URLs, or ``None`` when no
        proxies are configured (requests treats ``proxies=None`` as direct).
    """
    if not PROXIES:
        return
    if is_(p):
        addr = p
    else:
        addr = current_proxy() if rotate else random.choice(PROXIES)
    return {
        'http': PROXY_HTTP_URL.format(ip=addr),
        'https': PROXY_HTTPS_URL.format(ip=addr)
    }
def get_info(w, lang=None, parser=None, debug=False, postfix=None):
    """Fetch part-of-speech and morphology info for word ``w`` from Wiktionary.

    Args:
        w: the word to look up.
        lang: source language passed to the parser; also truncated to its
            first two characters for the 'src' field.
        parser: optional ``WiktionaryParser`` instance; a fresh one is
            created when omitted.
        debug: when True, print the traceback on failure.
        postfix: optional string appended to every key of the result dict.

    Returns:
        A dict with keys 'w', 'src', 'pos_wp', 'morph' (optionally
        postfixed), or ``{}`` on any failure (best-effort lookup).
    """
    try:
        if parser is None:
            parser = WiktionaryParser()
        info = parser.fetch(w, lang)[0]['definitions']
        pos_list, morph_list = [], []
        for info_dct in info:
            pos = info_dct['partOfSpeech']
            txt = info_dct['text']
            pos_list.append(pos)
            morph_list.append(wp_morph(txt, pos))
        # NOTE(review): lang[:2] raises TypeError when lang is left at its
        # None default; that error is absorbed below and yields {} — confirm
        # callers always pass lang explicitly.
        result_dict = {
            'w': w, 'src': lang[:2], 'pos_wp': pos_list, 'morph': morph_list
        }
        return {k + postfix: v for k, v in result_dict.items()} \
            if is_(postfix) else result_dict
    # Was a bare `except:` — narrowed so KeyboardInterrupt/SystemExit are not
    # silently converted into an empty result.
    except Exception:
        if debug:
            tb.print_exc()
        return {}
arg('-q', '--query', nargs='*'), arg('-s', '--src', default='fr'), arg('-t', '--target', default='en'), arg('-n', '--n-img', type=int, default=20), # pipeline arg('-is', '--sch', action='store_true'), arg('-rs', '--rsch', action='store_true'), arg('-pred', action='store_true'), # use saved arg('-load-urls'), arg('-load-preds'), ) name = opts.name + '__' if is_(opts.name) else '' RESULT_PREFIX = osp.join('reverse-img-final-preds', '%s_to_%s' % (opts.src, opts.target), name + time_stamp()) mkdir_p(RESULT_PREFIX) fh = init_logging(file=osp.join(RESULT_PREFIX, 'log.log'), stdout=True) LOGGER = get_logger(__name__, main=True) from nlp_utils import get_words from image_search import image_search from reverse_image_search import reverse_search_urls queries = [] if is_(opts.file): queries.extend(get_words(opts.file, i=opts.start, j=opts.stop))