Beispiel #1
0
def download(url, usecache=True, cached=None, cachedir='cache~/', cachedonly=False, **opts):
    """
    Download (or reuse a cached copy of) ``url``. On success: return the file
    name of the stored contents. Upon failure: return None.

    The cache file is ``cached`` when given; otherwise it is derived from
    ``url`` via ``secure_filename`` and placed inside ``cachedir`` (which is
    created with ``mkdir``).

    If ``usecache`` is enabled and the cache file already exists, it is
    returned without touching the network.

    If ``cachedonly`` is enabled, this function will not download anything. It
    will simply return the cached filename if it exists, and None otherwise
    (regardless of ``usecache``).

    URLs starting with ``ftp`` are fetched with ``wget``; URLs starting with
    ``http`` are fetched with ``robust_download``, which receives ``**opts``
    unchanged (presumably the ``tries``, ``pause`` and ``timeout`` settings
    for retrying -- confirm against ``robust_download``). Any other URL is
    not attempted and yields None.

    Raises AssertionError when caching is requested (``usecache``) but neither
    ``cached`` nor ``cachedir`` is provided.
    """

    if not cached:
        if cachedir:
            mkdir(cachedir)
            cached = os.path.join(cachedir, secure_filename(url))
        elif usecache:
            # Raised explicitly instead of via ``assert`` so the check is not
            # stripped under ``python -O``; the exception type is unchanged.
            raise AssertionError('must specify cachedir')

    # ``cached`` may still be empty here (no cachedir, usecache disabled);
    # guard so that os.path.exists is never handed None.
    have_cached = bool(cached) and os.path.exists(cached)

    # cachedonly never downloads: either the cached file is there or it isn't.
    if cachedonly:
        return cached if have_cached else None

    if usecache and have_cached:
        return cached

    # use wget for ftp files
    if url.startswith('ftp'):
        return wget(url, cached)

    if url.startswith('http'):
        return robust_download(url, cached, **opts)

    # unsupported URL scheme: nothing was downloaded
    return None
Beispiel #2
0
    def __init__(self,
                 corpus,
                 Y,
                 train,
                 dev,
                 initial_contexts,
                 outer_iterations,
                 inner_iterations,
                 group_budget,
                 regularizer,
                 allowed_contexts,
                 dump,
                 no_failure_arcs=0):
        """
        Initialise the learner: run ``VoCRF.__init__`` on the labels and the
        initial contexts, record the data and hyper-parameters, size the
        feature spaces and create the two optimizers.

        - ``corpus``, ``train``, ``dev``: stored as-is on the instance.
        - ``Y``, ``initial_contexts``: forwarded to ``VoCRF.__init__``;
          ``len(Y)`` also enters the size ``self.H``.
        - ``outer_iterations``, ``inner_iterations``, ``group_budget``:
          stored as-is; ``group_budget`` also enters ``self.H``.
        - ``regularizer``: stored divided by ``len(train)``; a positive value
          selects ``L=2`` for both optimizers, anything else ``L=-1``.
        - ``allowed_contexts``: if not None, stored as the set produced by
          ``prefix_closure``; otherwise ``self.allowed_contexts`` is None.
        - ``dump``: if not None, wrapped in ``Path`` and created via ``mkdir``;
          otherwise ``self.dump`` is None.
        - ``no_failure_arcs``: if true, runs model with last-char subst
          closure.
        """

        # if true, runs model with last-char subst closure.
        self.no_failure_arcs = no_failure_arcs

        # Build the initial pattern set first; self.C and self.A, read further
        # down, are presumably populated by this call (not visible here).
        VoCRF.__init__(self, Y, initial_contexts)

        # Optional output directory, created up front when requested.
        self.dump = None
        if dump is not None:
            self.dump = dump_dir = Path(dump)
            mkdir(dump_dir)

        self.corpus = corpus
        self.dev_best = -np.inf

        # Allowed contexts only make sense as a prefix-closed set, so close
        # them here before storing.
        self.allowed_contexts = None
        if allowed_contexts is not None:
            closed = prefix_closure(allowed_contexts)
            self.allowed_contexts = set(closed)

        self.train = train
        self.dev = dev

        # Upper bound on the number of higher-order features:
        #     budget            [green nodes - max number of 'active' contexts at any time]
        #   x extensions = |Y|  [yellow nodes - a little room to grow]
        #   x number of labels  [because that's how features are encoded]
        # XXX: likely an overestimate; |states| x |labels| is what is wanted.
        budget_slots = group_budget * len(Y)
        context_slots = len(self.C)
        self.H = max(budget_slots, context_slots) * self.A
        self.D = MAGIC * self.A

        self.group_budget = group_budget
        self.regularizer = regularizer / len(self.train)

        # L passed to both optimizers: 2 when regularizing, -1 otherwise.
        if regularizer > 0:
            norm = 2
        else:
            norm = -1

        self.sparse = LazyRegularizedAdagrad(self.D, L=norm, C=self.regularizer)
        groups = self.group_structure()
        self.dense = OnlineProx(groups, self.H, L=norm, C=self.regularizer)

        self.inner_iterations = inner_iterations
        self.outer_iterations = outer_iterations

        self.log = []
Beispiel #3
0
def download(url,
             usecache=True,
             cached=None,
             cachedir='cache~/',
             cachedonly=False,
             **opts):
    """
    Download (or reuse a cached copy of) ``url``. On success: return the file
    name of the stored contents. Upon failure: return None.

    The cache file is ``cached`` when given; otherwise it is derived from
    ``url`` via ``secure_filename`` and placed inside ``cachedir`` (which is
    created with ``mkdir``).

    If ``usecache`` is enabled and the cache file already exists, it is
    returned without touching the network.

    If ``cachedonly`` is enabled, this function will not download anything. It
    will simply return the cached filename if it exists, and None otherwise
    (regardless of ``usecache``).

    URLs starting with ``ftp`` are fetched with ``wget``; URLs starting with
    ``http`` are fetched with ``robust_download``, which receives ``**opts``
    unchanged (presumably the ``tries``, ``pause`` and ``timeout`` settings
    for retrying -- confirm against ``robust_download``). Any other URL is
    not attempted and yields None.

    Raises AssertionError when caching is requested (``usecache``) but neither
    ``cached`` nor ``cachedir`` is provided.
    """

    if not cached:
        if cachedir:
            mkdir(cachedir)
            cached = os.path.join(cachedir, secure_filename(url))
        elif usecache:
            # Explicit raise rather than ``assert``: asserts vanish under
            # ``python -O``. Same exception type as before for callers.
            raise AssertionError('must specify cachedir')

    # Without cachedir and with usecache off, ``cached`` can still be empty;
    # never pass that to os.path.exists.
    cache_hit = bool(cached) and os.path.exists(cached)

    # only return something for cached files -- and never fall through to a
    # download, even when usecache is disabled
    if cachedonly:
        if cache_hit:
            return cached
        return None

    if usecache and cache_hit:
        return cached

    # use wget for ftp files
    if url.startswith('ftp'):
        return wget(url, cached)

    if url.startswith('http'):
        return robust_download(url, cached, **opts)

    # unsupported URL scheme: nothing was downloaded
    return None