Esempio n. 1
0
def get_conf():
    """Build the run configuration from command-line arguments and YAML settings.

    Command-line options are parsed first. Every key found in
    ``conf/webcorpora.yaml`` (located with ``rel_path`` relative to this file)
    is then copied onto the result, unless the parsed namespace already holds
    a non-``None`` value for that key. Options with a non-``None`` default
    (``output``, ``threads``, the boolean flags) are therefore never replaced
    by the YAML file.

    :returns: the ``argparse`` namespace extended with the YAML settings
    :raises SystemExit: from ``argparse`` when required arguments are missing
        or a value cannot be converted to the declared type

    """
    parser = ArgumentParser(description=__doc__, formatter_class=RawTextHelpFormatter)
    parser.add_argument('-d', '--dict', required=True, help='Path to dictionary with phrases')
    parser.add_argument('-o', '--output', default='web', help='Mongo DB collection for saving corpora')
    parser.add_argument('--lang1', required=True, help='Language of first phrase in dictionary')
    parser.add_argument('--lang2', required=True, help='Language of second phrase in dictionary')
    parser.add_argument('-r', '--restart', action='store_true', help='Restart retrieving text')
    # Numeric options are converted explicitly: without ``type`` argparse
    # hands back strings, while the YAML fallback yields native numbers.
    parser.add_argument('--min-phrase-probability', default=None, type=float,
                        help='Minimal probability value for phrase pair')
    parser.add_argument('--results-for-phrase', default=None, type=int,
                        help='Number of results to save for each phrase')
    parser.add_argument('-t', '--threads', default=5, type=int, help='Number of threads for downloading html from websites')
    parser.add_argument('--debug', action='store_true', help='Debug mode')
    conf = parser.parse_args()

    yaml_file = rel_path(__file__, 'conf', 'webcorpora.yaml')
    with open(yaml_file) as f:
        # An empty YAML document loads as None; treat it as "no settings".
        yaml_conf = yaml.safe_load(f) or {}
    for key, val in yaml_conf.items():
        # Explicit command-line values win; YAML only fills missing/None ones.
        if getattr(conf, key, None) is None:
            setattr(conf, key, val)
    return conf
Esempio n. 2
0
    :phrase: TODO
    :returns: TODO

    """
    log.debug('get_text_async')
    return list(get_text([phrase], lang1, lang2, n))

# Script entry point: load the configuration, set up logging and the
# persistent state, then build the worker pool and the input iterator.
if __name__ == '__main__':
    conf = get_conf()
    if conf.debug:
        # Debug mode: let the root logger emit DEBUG records.
        logging.getLogger().setLevel(logging.DEBUG)
    else:
        # Otherwise only let errors through from the HTTP libraries' loggers.
        logging.getLogger('requests').setLevel(logging.ERROR)
        logging.getLogger('urllib3').setLevel(logging.ERROR)
    # Shelf stored next to this file under conf/state.db; it keeps one value
    # per dictionary file between runs.
    state = shelve.open(rel_path(__file__, 'conf', 'state.db'))
    log.debug('State %s', state)
    log.debug('Conf %s', conf)
    # The absolute path of the dictionary is the key into the shelf.
    state_id = os.path.abspath(conf.dict)
    # Value saved for this dictionary, initialised to 0 on first use.
    # NOTE(review): presumably the number of entries already processed --
    # confirm against progress_updater.
    skip = state.setdefault(state_id, 0)

    if conf.restart:
        # --restart discards the saved value and starts again from 0.
        skip = 0
        state[state_id] = 0

    # Each pool worker is initialised through init_async with the language
    # pair, the per-phrase result count and two delay settings.
    # NOTE(review): google_delay / google_big_delay are not command-line
    # options in get_conf, so they are expected to come from the YAML
    # settings; the delay is scaled by the thread count, presumably to keep
    # the combined request rate constant -- confirm in init_async.
    pool = Pool(conf.threads, init_async,
                (conf.lang1, conf.lang2, conf.results_for_phrase,
                 conf.google_delay*conf.threads, conf.google_big_delay))
    # Read the dictionary starting from the saved value, then wrap the
    # iterator with progress_updater, which receives the shelf and its key.
    data_iter = read_bidict(conf.dict, conf.min_phrase_probability, skip=skip)
    data_iter = progress_updater(state, state_id, data_iter, skip)