# Third-party imports used below; is_, slp, BadStatusCode, log_web_err,
# get_ua, wp_morph, current_proxy and the module-level constants/helpers
# (QUERY_URL, lang_params, get_imgs_from_soup, PROXIES, PROXY_*_URL) are
# project-local and assumed to be in scope.
import random
import traceback as tb

import requests
from bs4 import BeautifulSoup
from wiktionaryparser import WiktionaryParser


def retry(url,
          requestor=None,
          n_tries=2,
          sleep=None,
          good_codes=(200, ),
          reraise=True,
          always_sleep=True):
    # Use the caller-supplied requestor when given, otherwise fall back to
    # the request() helper defined below.
    do_request = (lambda: requestor(url)) if is_(requestor) \
            else (lambda: request(url))
    for i in range(n_tries):
        try:
            r = do_request()
            if always_sleep and is_(sleep):
                slp(sleep)
            if r.status_code in good_codes:
                return r
            raise BadStatusCode(r.status_code)
        except Exception:
            # Out of attempts: re-raise the last error or give up quietly.
            if i >= n_tries - 1:
                if reraise:
                    raise
                return
            # i is 0-based, so the attempt about to run is number i + 2.
            prefix = '\nBAD REQUEST, RETRY (%s/%s)' % (i + 2, n_tries)
            log_web_err(url, kind='warning', prefix=prefix)
            if not always_sleep and is_(sleep):
                slp(sleep)
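
# A hedged usage sketch for retry(): up to three attempts, a pause between
# failed tries (assuming, as get_imgs below does, that slp accepts a
# (lo, hi) range), and None instead of an exception on final failure.
# The URL is illustrative.
def _retry_example():
    resp = retry('https://example.com/page',
                 n_tries=3,
                 sleep=(0.5, 1.5),
                 reraise=False,
                 always_sleep=False)
    # resp is None when every attempt failed and reraise=False.
    return None if resp is None else resp.text
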
def request(url, kind='get', sleep=None, **kw):
    # Thin wrapper over requests.<kind>() that attaches headers from the
    # project helper get_ua() and a proxy mapping from get_proxy(); the
    # optional post-request sleep throttles crawling.
    resp = getattr(requests, kind)(url,
                                   headers=get_ua(),
                                   proxies=get_proxy(),
                                   **kw)
    if is_(sleep):
        slp(sleep)
    return resp
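
# Usage sketch: request() forwards **kw straight to requests, so standard
# keyword arguments such as params= or timeout= pass through unchanged
# (the URL and values here are illustrative).
def _request_example():
    resp = request('https://example.com/search',
                   kind='get',
                   sleep=1,
                   params={'q': 'cat'},
                   timeout=10)
    return resp.status_code
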
def get_imgs(query, lang=None, debug=None):
    # Build the image-search URL (QUERY_URL and lang_params are project-
    # local), fetch it with a (0.5, 1.5)s sleep spec handed to slp, and
    # scrape the image results out of the page.
    query_url = QUERY_URL(query)
    if is_(lang):
        query_url += '&' + lang_params(lang)
    if debug:
        print('query_url for %s: %s' % (query, query_url))
    resp = request(query_url, sleep=(.5, 1.5))
    soup = BeautifulSoup(resp.text, 'lxml')
    return get_imgs_from_soup(soup)
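
# Usage sketch for get_imgs(): QUERY_URL, lang_params and
# get_imgs_from_soup are project-local, so this only illustrates the
# calling convention; the query and language code are illustrative.
def _get_imgs_example():
    return get_imgs('chat', lang='fr', debug=True)
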
def get_proxy(p=None, rotate=True):
    # No proxy pool configured: return None so requests connects directly.
    if not PROXIES:
        return

    if is_(p):
        proxy_addr = p  # caller pinned a specific proxy
    elif rotate:
        proxy_addr = current_proxy()  # project-local rotation over PROXIES
    else:
        proxy_addr = random.choice(PROXIES)

    return {
        'http': PROXY_HTTP_URL.format(ip=proxy_addr),
        'https': PROXY_HTTPS_URL.format(ip=proxy_addr)
    }
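
# A hedged sketch of what get_proxy() yields: assuming PROXIES is
# non-empty (otherwise it returns None even when p is given) and template
# values like PROXY_HTTP_URL = 'http://{ip}' and
# PROXY_HTTPS_URL = 'https://{ip}', the result is the proxies= mapping
# that requests expects. The address is illustrative.
def _get_proxy_example():
    mapping = get_proxy(p='1.2.3.4:8080')
    # e.g. {'http': 'http://1.2.3.4:8080', 'https': 'https://1.2.3.4:8080'}
    return mapping
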
def get_info(w, lang=None, parser=None, debug=False, postfix=None):
    # Look up a word on Wiktionary and collect one part-of-speech tag and
    # one morphology record (via the project-local wp_morph) per
    # definition; returns {} on any failure.
    try:
        if parser is None:
            parser = WiktionaryParser()
        info = parser.fetch(w, lang)[0]['definitions']
        pos_list, morph_list = [], []
        for info_dct in info:
            pos = info_dct['partOfSpeech']
            txt = info_dct['text']
            pos_list.append(pos)
            morph_list.append(wp_morph(txt, pos))
        result_dict = {
            'w': w,
            'src': lang[:2],
            'pos_wp': pos_list,
            'morph': morph_list
        }
        # Optionally suffix every key, e.g. postfix='_fr' -> 'w_fr'.
        return {k + postfix: v for k, v in result_dict.items()} \
            if is_(postfix) else result_dict
    except Exception:
        if debug:
            tb.print_exc()
        return {}
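
# Usage sketch for get_info(): fetch the French word 'chien' and suffix
# every key so results for several languages can be merged into one
# record. wp_morph is project-local, so the morph values depend on the
# project; the word, language and postfix here are illustrative.
def _get_info_example():
    parser = WiktionaryParser()
    info = get_info('chien', lang='french', parser=parser, postfix='_fr')
    # e.g. info['w_fr'] == 'chien' and info['src_fr'] == 'fr'
    return info
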
Example #6
    arg('-q', '--query', nargs='*'),
    arg('-s', '--src', default='fr'),
    arg('-t', '--target', default='en'),
    arg('-n', '--n-img', type=int, default=20),

    # pipeline
    arg('-is', '--sch', action='store_true'),
    arg('-rs', '--rsch', action='store_true'),
    arg('-pred', action='store_true'),

    # use saved
    arg('-load-urls'),
    arg('-load-preds'),
)

name = opts.name + '__' if is_(opts.name) else ''
RESULT_PREFIX = osp.join('reverse-img-final-preds',
                         '%s_to_%s' % (opts.src, opts.target),
                         name + time_stamp())
mkdir_p(RESULT_PREFIX)

fh = init_logging(file=osp.join(RESULT_PREFIX, 'log.log'), stdout=True)
LOGGER = get_logger(__name__, main=True)

# Project-local modules, imported only after logging is configured
# (presumably so their module-level loggers pick up the handlers above).
from nlp_utils import get_words
from image_search import image_search
from reverse_image_search import reverse_search_urls

queries = []
if is_(opts.file):
    queries.extend(get_words(opts.file, i=opts.start, j=opts.stop))