Example no. 1
def big_kmeans(docs, k, batch_size=1000, n_features=(2**20), single_pass=True):
    """k-means for very large sets of documents.

    See kmeans for documentation. Differs from that function in that it does
    not compute tf-idf or LSA, and fetches the documents in a streaming
    fashion, so they don't need to be held in memory. It does not do random
    restarts.

    If the option single_pass is set to False, the documents are visited
    twice: once to fit a k-means model, once to determine their label in
    this model.
    """
    from sklearn.cluster import MiniBatchKMeans
    from sklearn.feature_extraction.text import HashingVectorizer

    docs = tosequence(docs)

    v = HashingVectorizer(input="content", n_features=n_features, norm="l2")
    km = MiniBatchKMeans(n_clusters=k)

    labels = []
    for batch in toolz.partition_all(batch_size, docs):
        batch = map(fetch, batch)
        batch = v.transform(batch)
        y = km.fit_predict(batch)
        if single_pass:
            labels.extend(y.tolist())

    if not single_pass:
        for batch in toolz.partition_all(batch_size, docs):
            batch = map(fetch, batch)
            batch = v.transform(batch)
            labels.extend(km.predict(batch).tolist())

    return group_clusters(docs, labels)
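
All of the examples on this page build on the same primitive: toolz.partition_all(n, seq) lazily yields tuples of at most n items, with the last tuple shorter when the input length is not a multiple of n. A minimal sketch, assuming toolz is importable:

import toolz

# Chunk a sequence of 8 items into groups of 3; the final chunk is short.
chunks = list(toolz.partition_all(3, range(8)))
assert chunks == [(0, 1, 2), (3, 4, 5), (6, 7)]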
Example no. 2
def test_from_to_iterable(nums):

    nums_pl = nums
    nums_pl = pl.sync.from_iterable(nums_pl)
    nums_pl = cz.partition_all(10, nums_pl)
    nums_pl = pl.sync.map(sum, nums_pl)
    nums_pl = list(nums_pl)

    nums_py = nums
    nums_py = cz.partition_all(10, nums_py)
    nums_py = map(sum, nums_py)
    nums_py = list(nums_py)

    assert nums_py == nums_pl
Example no. 3
def test_iterable_and_map(nums):

    nums_pl = nums
    nums_pl = pl.task.from_iterable(nums_pl)
    nums_pl = cz.partition_all(10, nums_pl)
    nums_pl = pl.task.map(sum, nums_pl)
    nums_pl = list(nums_pl)

    nums_py = nums
    nums_py = cz.partition_all(10, nums_py)
    nums_py = map(sum, nums_py)
    nums_py = list(nums_py)

    assert nums_py == nums_pl
Example no. 4
async def test_from_to_iterable_2(nums: tp.List[int]):

    nums_pl = nums
    nums_pl = pl.task.from_iterable(nums_pl)
    nums_pl = cz.partition_all(10, nums_pl)
    nums_pl = pl.task.map(sum, nums_pl)
    nums_pl = pl.task.to_async_iterable(nums_pl)
    nums_pl = [x async for x in nums_pl]

    nums_py = nums
    nums_py = cz.partition_all(10, nums_py)
    nums_py = map(sum, nums_py)
    nums_py = list(nums_py)

    assert nums_py == nums_pl
Example no. 5
def test_from_to_iterable(nums: tp.List[int]):

    nums_pl = nums
    nums_pl = pl.thread.from_iterable(nums_pl)
    nums_pl = cz.partition_all(10, nums_pl)
    nums_pl = pl.thread.map(sum, nums_pl)
    nums_pl = pl.thread.to_iterable(nums_pl)
    nums_pl = list(nums_pl)

    nums_py = nums
    nums_py = cz.partition_all(10, nums_py)
    nums_py = map(sum, nums_py)
    nums_py = list(nums_py)

    assert nums_py == nums_pl
Example no. 6
def test_benchmark_RNN_fwd():
    nO = 128
    nI = 128
    n_batch = 1000
    batch_size = 30
    seq_len = 30
    lengths = numpy.random.normal(scale=10, loc=30, size=n_batch * batch_size)
    lengths = numpy.maximum(lengths, 1)
    batches = []
    uniform_lengths = False
    for batch_lengths in partition_all(batch_size, lengths):
        batch_lengths = list(batch_lengths)
        if uniform_lengths:
            seq_len = max(batch_lengths)
            batch = [
                numpy.asarray(numpy.random.uniform(0., 1., (int(seq_len), nI)),
                              dtype='f') for _ in batch_lengths
            ]
        else:
            batch = [
                numpy.asarray(numpy.random.uniform(0., 1., (int(seq_len), nI)),
                              dtype='f') for seq_len in batch_lengths
            ]
        batches.append(batch)
    model = LSTM(nO, nI)
    start = timeit.default_timer()
    for Xs in batches:
        ys, bp_ys = model.begin_update(list(Xs))
        #_ = bp_ys(ys)
    end = timeit.default_timer()
    n_samples = n_batch * batch_size
    print("--- %i samples in %s seconds (%f samples/s, %.7f s/sample) ---" %
          (n_samples, end - start, n_samples / (end - start),
           (end - start) / n_samples))
Example no. 7
 def __iter__(self):
     ids = self._create_ids()
     random.shuffle(ids)
     buckets = [
         sorted(ids[i:i + self._bucket_size],
                key=self.sort_fn,
                reverse=True) for i in range(0, len(ids), self._bucket_size)
     ]
     # fill batches until max_token (include padding)
     batches = []
     for bucket in buckets:
         max_len = 0
         batch_indices = []
         for indices in partition_all(self._mul, bucket):
             # make sure batch size is multiple of 8
             ind_max = max(self._keys[i] for i in indices)
             ind_max = max(ind_max)  # max of src/tgt
             max_len = max(max_len, ind_max)
             if max_len * (len(batch_indices) + self._mul) > self._max_tok:
                 if not batch_indices:
                     raise ValueError(
                         "max_tokens too small / max_seq_len too long")
                 batches.append(batch_indices)
                 batch_indices = list(indices)
                 max_len = ind_max
             else:
                 batch_indices.extend(indices)
         if not self._droplast and batch_indices:
             batches.append(batch_indices)
     random.shuffle(batches)
     return iter(batches)
Example no. 8
def test_benchmark_RNN_fwd():
    nO = 128
    nI = 128
    n_batch = 1000
    batch_size = 30
    seq_len = 30
    lengths = numpy.random.normal(scale=10, loc=30, size=n_batch*batch_size)
    lengths = numpy.maximum(lengths, 1)
    batches = []
    uniform_lengths = False
    for batch_lengths in partition_all(batch_size, lengths):
        batch_lengths = list(batch_lengths)
        if uniform_lengths:
            seq_len = max(batch_lengths)
            batch = [numpy.asarray(
                numpy.random.uniform(0., 1., (int(seq_len), nI)), dtype='f')
                for _ in batch_lengths
            ]
        else:
            batch = [numpy.asarray(
                numpy.random.uniform(0., 1., (int(seq_len), nI)), dtype='f')
                for seq_len in batch_lengths
            ]
        batches.append(batch)
    model = LSTM(nO, nI)
    start = timeit.default_timer()
    for Xs in batches:
        ys, bp_ys = model.begin_update(list(Xs))
        #_ = bp_ys(ys)
    end = timeit.default_timer()
    n_samples = n_batch * batch_size
    print("--- %i samples in %s seconds (%f samples/s, %.7f s/sample) ---" % (n_samples, end - start, n_samples / (end - start), (end - start) / n_samples))
Example no. 9
    def __iter__(self):
        ids = self._create_ids()
        random.shuffle(ids)
        buckets = [
            sorted(ids[i:i + self._bucket_size],
                   key=self._sort_fn,
                   reverse=True) for i in range(0, len(ids), self._bucket_size)
        ]
        batches = []
        for bucket in buckets:
            max_len = 0
            batch_indices = []

            # Partition the bucket into tuples of length at most self._size_mul
            for indices in partition_all(self._size_mul, bucket):
                max_len = max(max_len, max(self._lens[i] for i in indices))
                # fill batches until max_token (include padding)
                if max_len * (len(batch_indices) +
                              self._size_mul) > self._max_tok:
                    if not batch_indices:
                        raise ValueError(
                            "max_tokens too small / max_seq_len too long")
                    assert len(batch_indices) % self._size_mul == 0
                    batches.append(batch_indices)
                    batch_indices = list(indices)
                else:
                    batch_indices.extend(indices)
            if not self._droplast and batch_indices:
                batches.append(batch_indices)
        random.shuffle(batches)
        return iter(batches)
Example no. 10
 def __iter__(self):
     ids = self._create_ids()
     random.shuffle(ids)
     # Build len(buckets) buckets of size _bucket_size covering the whole dataset.
     # anwen hu note: in each bucket, instances are ranked according to the entire sequence length (txt + num_bb)
     buckets = [sorted(ids[i:i+self._bucket_size],
                       key=self._sort_fn, reverse=True)
                for i in range(0, len(ids), self._bucket_size)]
     # fill batches until max_token (include padding)
     batches = []
     for bucket in buckets:
         max_len = 0
         batch_indices = []
         for indices in partition_all(self._size_mul, bucket):
             max_len = max(max_len, max(self._lens[i] for i in indices))
             if (max_len * (len(batch_indices) + self._size_mul) > self._max_tok):
                 if not batch_indices:
                     raise ValueError(
                         "max_tokens too small / max_seq_len too long")
                 assert len(batch_indices) % self._size_mul == 0
                 batches.append(batch_indices)
                 batch_indices = list(indices)
             else:
                 batch_indices.extend(indices)
         if not self._droplast and batch_indices:
             batches.append(batch_indices)
     random.shuffle(batches)
     return iter(batches)
Example no. 11
def insert_predictions(cfg, predictions):
    with cluster(cfg) as c:
        s = session(cfg, c)
        st = '''INSERT INTO {keyspace}.prediction 
                    (cx, cy, px, py, sday, eday, pday, prob) 
                VALUES 
                    (?, ?, ?, ?, ?, ?, ?, ?)'''.format(
            keyspace=cfg['cassandra_keyspace'])

        stmt = s.prepare(st)

        chunks = partition_all(cfg['cassandra_batch_size'], predictions)

        batches = []

        for chunk in chunks:

            batch = BatchStatement(batch_type=BatchType.UNLOGGED)

            for c in chunk:

                batch.add(stmt, [
                    c['cx'], c['cy'], c['px'], c['py'], c['sday'], c['eday'],
                    c['pday'], c['prob']
                ])
            batches.append(batch)

        return [s.execute(b) for b in batches]
Example no. 12
def async_requests(
    url_payload: List[Tuple[str, Optional[MutableMapping[str, Any]]]],
    read: str,
    request: str = "GET",
    max_workers: int = 8,
) -> List[Union[str, MutableMapping[str, Any], bytes]]:
    """Send async requests.

    This function is based on
    `this <https://github.com/HydrologicEngineeringCenter/data-retrieval-scripts/blob/master/qpe_async_download.py>`__
    script.

    Parameters
    ----------
    url_payload : list of tuples
        A list of URLs and payloads as a tuple.
    read : str
        The method for returning the request; one of binary, json, or text.
    request : str, optional
        The request type; GET or POST, defaults to GET.
    max_workers : int, optional
        The maximum number of async processes, defaults to 8.

    Returns
    -------
    list
        A list of responses
    """
    chunked_urls = tlz.partition_all(max_workers, url_payload)

    results = (asyncio.get_event_loop().run_until_complete(
        _async_session(c, read, request)) for c in chunked_urls)
    return list(tlz.concat(results))
Example no. 13
    def __init__(self, path, number_of_columns, rowspaces, page_spaces,
                 rows_in_page):
        self._path = path
        self._number_of_columns = number_of_columns
        self._rowspaces = rowspaces
        self._page_spaces = page_spaces
        self._rows_in_page = rows_in_page

        self._cols = range(self._number_of_columns)
        total_width = 90
        width = total_width // self._number_of_columns

        file_list = filter_jpg(path)
        calc = xcoord(number_of_columns=self._number_of_columns, width=width)
        self._left_shifts = list(map(calc, self._cols))

        # partitions list of files into tuples with len == number_of_columns
        # so each row will contain 5 files, if number_of_columns == 5
        # [(file1, file2, ... , file5), (file6, ... , file10), ...]
        each_row = cytoolz.partition_all(self._number_of_columns, file_list)

        # each page has `rows_in_page` rows. every row is grouped with another.
        # [(row1, row2), (row3, row4), ...]
        # where row1 == (file1, file2, ...)
        self._pages_list = cytoolz.partition(self._rows_in_page,
                                             each_row,
                                             pad=None)
        self._pages_list = list(self._pages_list)

        assert len(self._pages_list[0]) <= len(
            self._rowspaces) == self._rows_in_page
        assert len(self._pages_list) <= len(self._page_spaces)
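
The two grouping calls above make different choices at the ragged tail: partition_all keeps a short final tuple, while partition(..., pad=None) pads the last group to full length. A minimal side-by-side sketch, assuming cytoolz is importable:

import cytoolz

files = ['f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7']

# partition_all keeps the short final row ...
rows = list(cytoolz.partition_all(3, files))
assert rows == [('f1', 'f2', 'f3'), ('f4', 'f5', 'f6'), ('f7',)]

# ... while partition(..., pad=None) pads the last page to full length.
pages = list(cytoolz.partition(2, rows, pad=None))
assert pages == [(('f1', 'f2', 'f3'), ('f4', 'f5', 'f6')), (('f7',), None)]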
Example no. 14
    def make_embeddings(self, load=False, filename=None):
        """ Embed all the sentences as ELMo embeddings
        
            Args:
                load: if True, load from file using the given filename
                filename: string of filename for saved ELMo embeddings, if None,
                    loader defaults to 'elmo_embeddings.npy' to load from
        
        """
        if load:
            self.load_elmo_embeddings(filename)
            return

        # Get the ELMo model
        url = "https://tfhub.dev/google/elmo/2"
        embed_model = hub.Module(url)

        all_embeddings = []
        #         if self.use_gpu:
        #             device = '/gpu:0'
        #         else:
        #             device = '/cpu:0'
        #         with tf.device(device):
        for sentence_block in cytoolz.partition_all(150, self.sentences):
            embeddings = embed_model(sentence_block,
                                     signature="default",
                                     as_dict=True)["default"]
            with tf.Session() as sess:
                sess.run(tf.global_variables_initializer())
                sess.run(tf.tables_initializer())
                x = sess.run(embeddings)
            all_embeddings.extend(x)

        self.elmo_embeddings = np.array(all_embeddings)
        self.reduced_embeddings = self.elmo_embeddings
Example no. 15
    def _drainage_area_sqm(self, siteinfo: pd.DataFrame,
                           freq: str) -> pd.Series:
        """Get drainage area of the stations."""
        area = siteinfo[["site_no", "drain_sqkm"]].copy()
        if area["drain_sqkm"].isna().any():
            sids = area[area["drain_sqkm"].isna()].site_no
            queries = [{
                "parameterCd": "00060",
                "siteStatus": "all",
                "outputDataTypeCd": freq,
                "sites": ",".join(s),
            } for s in tlz.partition_all(1500, sids)]
            info = self.get_info(queries, expanded=True)

            def get_idx(ids: List[str]) -> Tuple[pd.Index, pd.Index]:
                return info.site_no.isin(ids), area.site_no.isin(ids)

            i_idx, a_idx = get_idx(sids)
            # Drainage areas in info are in sq mi and should be converted to sq km
            area.loc[a_idx, "drain_sqkm"] = info.loc[
                i_idx, "contrib_drain_area_va"] * 0.38610
            if area["drain_sqkm"].isna().any():
                sids = area[area["drain_sqkm"].isna()].site_no
                i_idx, a_idx = get_idx(sids)
                area.loc[a_idx,
                         "drain_sqkm"] = info.loc[i_idx,
                                                  "drain_area_va"] * 0.38610

        if area["drain_sqkm"].isna().all():
            raise DataNotAvailable("drainage")
        return area.set_index("site_no").drain_sqkm * 1e6
Example no. 16
    def _get_streamflow(self, sids: Sequence[str], start_dt: str, end_dt: str,
                        freq: str, kwargs: Dict[str, str]) -> pd.DataFrame:
        """Convert json to dataframe."""
        payloads = [{
            "sites": ",".join(s),
            "startDT": start_dt,
            "endDT": end_dt,
            **kwargs,
        } for s in tlz.partition_all(1500, sids)]
        resp = ar.retrieve_json(
            [f"{self.url}/{freq}"] * len(payloads),
            [{
                "params": p
            } for p in payloads],
            expire_after=self.expire_after,
            disable=self.disable_caching,
        )

        def get_site_id(site_cd: Dict[str, str]) -> str:
            """Get site id."""
            return f"{site_cd['agencyCode']}-{site_cd['value']}"

        r_ts = {
            get_site_id(t["sourceInfo"]["siteCode"][0]):
            t["values"][0]["value"]
            for r in resp for t in r["value"]["timeSeries"]
            if len(t["values"][0]["value"]) > 0
        }
        if len(r_ts) == 0:
            raise DataNotAvailable("discharge")

        def to_df(col: str, values: Dict[str, Any]) -> pd.DataFrame:
            discharge = pd.DataFrame.from_records(values,
                                                  exclude=["qualifiers"],
                                                  index=["dateTime"])
            discharge.index = pd.to_datetime(discharge.index,
                                             infer_datetime_format=True)
            if discharge.index.tz is None:
                tz = resp[0]["value"]["timeSeries"][0]["sourceInfo"][
                    "timeZoneInfo"]
                tz_dict = {
                    "CST": "US/Central",
                    "MST": "US/Mountain",
                    "PST": "US/Pacific",
                    "EST": "US/Eastern",
                }
                time_zone = tz_dict.get(
                    tz["defaultTimeZone"]["zoneAbbreviation"],
                    tz["defaultTimeZone"]["zoneAbbreviation"],
                )

                discharge.index = discharge.index.tz_localize(time_zone)
            discharge.index = discharge.index.tz_convert("UTC")
            discharge.columns = [col]
            return discharge

        qobs = pd.concat([to_df(s, t) for s, t in r_ts.items()], axis=1)
        # Convert cfs to cms
        return qobs.astype("float64") * 0.028316846592
Example no. 17
def batch_train_custom_cumulate(dataset, input_model=None, output_model=None, lang='en',
                factor=1, dropout=0.2, n_iter=1, batch_size=10,
                eval_id=None, eval_split=None, long_text=False, silent=False,shuffle=False,gpu_id = None):
    if(gpu_id == 0 and torch.cuda.is_available()):
        print("Using cuda")
        os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
        cudnn.benchmark = True
    if(n_iter ==1):
        print("one pass mode")
    print("batch_size",batch_size)
    #print(factor,type(factor))
    DB = connect()
    print_ = get_print(silent)
    random.seed(0)
    if input_model is not None:
        nlp = spacy.load(input_model, disable=['ner'])
        print_('\nLoaded model {}'.format(input_model))
        model = TextClassifier(nlp, labels, long_text=long_text,
                               low_data=len(examples) < 1000)
    else:
        print("build your customized model")
        pt_model = FastText(vocab_size=684831, emb_dim = 300).cuda()
        optimizer = torch.optim.Adam(pt_model.parameters(), lr=0.001)
        criterion = nn.BCELoss()
        model = Prodigy_model(pt_model,label_size=1,optimizer=optimizer,loss=criterion)
    examples = DB.get_dataset(dataset)
    if eval_id:
        evals = DB.get_dataset(eval_id)
        print_("Loaded {} evaluation examples from '{}'"
               .format(len(evals), eval_id))
    examples = examples[:int(len(examples) * factor)]
    print_(printers.trainconf(dropout, n_iter, batch_size, factor,
                              len(examples)))
    if len(evals) > 0:
        print_(printers.tc_update_header())
    best_acc = {'accuracy': 0}
    best_model = None
    start_time = datetime.now()
    if len(evals) > 0:
        model.init_eval(evals)
    interval = 100
    for fac in np.arange(interval,len(examples)+interval,interval):
        examples_fac = examples[:fac]
        batch_number = len(examples_fac) / batch_size
        for i in range(n_iter):
            if shuffle:
                print("it's shuffling")
                random.shuffle(examples)
            batch_idx = 0
            loss = 0
            for batch in cytoolz.partition_all(batch_size,
                                               tqdm.tqdm(examples, leave=False)):
                batch = list(batch)
                loss += model.update(batch)
                batch_idx += 1
            acc = model.evaluate(evals)
            end_time = datetime.now() - start_time  # elapsed time since training started
            print_('Time: [{0} seconds], process: [{1}/{2}], Epoch: [{3}/{4}], step: [{5}/{6}], Loss: {7}, Acc: {8}'.format(
                end_time.seconds, fac, len(examples) // interval, i + 1, n_iter, batch_idx + 1, len(examples_fac) // batch_size, loss / batch_number, acc))
    return acc
Example no. 18
def process(reader, writer, tokenizer):
    with mp.Pool() as pool, tqdm(desc='tokenizing') as pbar:
        for lines in partition_all(BUF, reader):
            for tokens in pool.imap(tokenize(tokenizer),
                                    lines,
                                    chunksize=CHUNK):
                write(writer, tokens)
            pbar.update(len(lines))
Example no. 19
 def pipe(self, docs, batch_size=1000, n_threads=n_threads):
     for minibatch in cytoolz.partition_all(batch_size, docs):
         minibatch = list(minibatch)
         for doc in minibatch:
             Xs = get_char_features(self.char_index, doc.sents, self.max_length)
             ys = self._model.predict(Xs)
             doc.user_data['toxics'] = ys.mean(axis=0)
             yield doc
Example no. 20
    def _tokenise(self):
        """Partition the dataset and run tokeniser in parallel."""

        partitions = partition_all(self._batch_size, self._line_iter())
        executor = Parallel(n_jobs=self._n_jobs)
        tasks = (delayed(subprocess)(NLP, part)
                 for part in partitions)
        return flatten(executor(tasks))
Example no. 21
 async def request_receipts(self, headers: List[BlockHeader]) -> None:
     for batch in partition_all(eth.MAX_RECEIPTS_FETCH, headers):
         peer = await self.peer_pool.get_random_peer()
         cast(ETHPeer, peer).sub_proto.send_get_receipts(
             [header.hash for header in batch])
         self.logger.debug("Requesting %d block receipts to %s", len(batch),
                           peer)
         now = time.time()
         for header in batch:
             self._pending_receipts[header.receipt_root] = (header, now)
Example no. 22
 def _inner_preduce(x):
     """Splits the sequence into pairs and possibly one
     singlet, on each of which `fn` is performed to create
     a new sequence.
     """
     if len(x) < 3:
         return _sfn(x)
     paired_x = partition_all(2, x)
     new_x = tuple(pool.map(_sfn, paired_x))
     return _inner_preduce(new_x)
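
For intuition, the recursion above pairs elements with partition_all(2, ...) and reduces each pair, halving the sequence on every round. A standalone sketch of the same idea using concurrent.futures instead of the pool object from the original code (preduce_sketch and its names are illustrative, not from the source):

from concurrent.futures import ThreadPoolExecutor
from functools import reduce
from operator import add

from toolz import partition_all

def preduce_sketch(fn, seq):
    """Reduce pairs of elements in parallel until a single value remains."""
    items = tuple(seq)
    with ThreadPoolExecutor() as pool:
        while len(items) > 1:
            pairs = partition_all(2, items)
            items = tuple(pool.map(lambda pair: reduce(fn, pair), pairs))
    return items[0]

assert preduce_sketch(add, range(10)) == sum(range(10))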
Example no. 23
 def _request_block_parts(
         self, target_td: int, headers: List[BlockHeader],
         request_func: Callable[[ETHPeer, List[BlockHeader]], None]) -> int:
     peers = self.peer_pool.get_peers(target_td)
     if not peers:
         raise NoEligiblePeers()
     length = math.ceil(len(headers) / len(peers))
     batches = list(partition_all(length, headers))
     for peer, batch in zip(peers, batches):
         request_func(cast(ETHPeer, peer), batch)
     return len(batches)
Example no. 24
 async def request_bodies(self, headers: List[BlockHeader]) -> None:
     for batch in partition_all(eth.MAX_BODIES_FETCH, headers):
         peer = await self.peer_pool.get_random_peer()
         cast(ETHPeer, peer).sub_proto.send_get_block_bodies(
             [header.hash for header in batch])
         self.logger.debug("Requesting %d block bodies to %s", len(batch),
                           peer)
         now = time.time()
         for header in batch:
             key = (header.transaction_root, header.uncles_hash)
             self._pending_bodies[key] = (header, now)
Example no. 25
def merge_sentences(docs, n_sents):
    counter = 0
    merged = []
    for group in partition_all(n_sents, docs):
        group = list(group)
        first = group.pop(0)
        to_extend = first['paragraphs'][0]['sentences']
        for sent in group:
            to_extend.extend(sent['paragraphs'][0]['sentences'])
        merged.append(first)
    return merged
Example no. 26
def merge_sentences(docs, n_sents):
    counter = 0
    merged = []
    for group in partition_all(n_sents, docs):
        group = list(group)
        first = group.pop(0)
        to_extend = first['paragraphs'][0]['sentences']
        for sent in group:
            to_extend.extend(sent['paragraphs'][0]['sentences'])
        merged.append(first)
    return merged
Example no. 27
    async def _download_receipts(self,
                                 target_td: int,
                                 all_headers: Tuple[BlockHeader, ...]) -> None:
        """
        Downloads and persists the receipts for the given set of block headers.
        Receipts are requested from all peers in equal sized batches.
        """
        # Post-Byzantium blocks may have identical receipt roots (e.g. when they have the same
        # number of transactions and all succeed/failed: ropsten blocks 2503212 and 2503284),
        # so we do this to avoid requesting the same receipts multiple times.
        headers = tuple(unique(
            (header for header in all_headers if not _is_receipts_empty(header)),
            key=operator.attrgetter('receipt_root'),
        ))

        while headers:
            # split the remaining headers into equal sized batches for each peer.
            peers = cast(Tuple[ETHPeer, ...], self.peer_pool.get_peers(target_td))
            if not peers:
                raise NoEligiblePeers(
                    "No connected peers have the receipts we need for td={0}".format(target_td)
                )
            batch_size = math.ceil(len(headers) / len(peers))
            batches = tuple(partition_all(batch_size, headers))

            # issue requests to all of the peers and wait for all of them to respond.
            requests = tuple(
                self._get_receipts(peer, batch)
                for peer, batch
                in zip(peers, batches)
            )
            responses = await self.wait(asyncio.gather(
                *requests,
                loop=self.get_event_loop(),
            ))

            # extract the returned receipt data and the headers for which we
            # are still missing receipts.
            all_receipt_bundles, all_missing_headers = zip(*responses)
            receipt_bundles = tuple(concat(all_receipt_bundles))
            headers = tuple(concat(all_missing_headers))

            if len(receipt_bundles) == 0:
                continue

            # process all of the returned receipts, storing their trie data
            # dicts in the database
            receipts, trie_roots_and_data_dicts = zip(*receipt_bundles)
            trie_roots, trie_data_dicts = zip(*trie_roots_and_data_dicts)
            for trie_data in trie_data_dicts:
                await self.wait(self.db.coro_persist_trie_data_dict(trie_data))

        self.logger.debug("Got receipts batch for %d headers", len(all_headers))
Example no. 28
 def pipe(self, docs, batch_size=1000):
     for minibatch in cytoolz.partition_all(batch_size, docs):
         minibatch = list(minibatch)
         sentences = []
         for doc in minibatch:
             sentences.extend(doc.sents)
         Xs = get_features(sentences, self.max_length)
         ys = self._model.predict(Xs)
         for sent, label in zip(sentences, ys):
             sent.doc.sentiment += label - 0.5
         for doc in minibatch:
             yield doc
Example no. 29
 def pipe(self, docs, batch_size=1000, n_threads=2):
     for minibatch in cytoolz.partition_all(batch_size, docs):
         minibatch = list(minibatch)
         sentences = []
         for doc in minibatch:
             sentences.extend(doc.sents)
         Xs = get_features(sentences, self.max_length)
         ys = self._model.predict(Xs)
         for sent, label in zip(sentences, ys):
             sent.doc.sentiment += label - 0.5
         for doc in minibatch:
             yield doc
Example no. 30
 def pipe(self, docs, batch_size=1000, n_threads=2):
     for minibatch in cytoolz.partition_all(batch_size, docs):
         minibatch = list(minibatch)
         sentences = []
         for doc in minibatch:
             sentences.extend(doc.sents)
         Xs = get_features(sentences, self.max_length)
         ys = self._model.predict(Xs)
         for sent, label in zip(sentences, ys):
             sent.doc.user_data['output_labels'] = numpy.where(
                 label == numpy.amax(label), 1, 0)
         for doc in minibatch:
             yield doc
Example no. 31
def find_number_map(x, y):
    if not (x >= 1 and y >= 1):
        return False
    # 5 = number of cols; 6 = number of rows, 30 images
    number_map = list(cytoolz.partition_all(5, range(30)))

    try:
        # coordinates are 1-based index
        number = number_map[y - 1][x - 1]
    except IndexError:
        print('Invalid number!\n')
        return False
    return number
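
A quick, illustrative check of the 1-based coordinate mapping above (assumes the find_number_map definition just shown and an importable cytoolz):

# With 5 columns, (x=1, y=1) maps to image 0 and (x=3, y=2) maps to image 7.
assert find_number_map(1, 1) == 0
assert find_number_map(3, 2) == 7
# An x beyond the 5th column falls off the map and returns False.
assert find_number_map(6, 1) is False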
Example no. 32
    async def _download_block_bodies(
        self, target_td: int, all_headers: Tuple[BlockHeader, ...]
    ) -> Dict[Tuple[Hash32, Hash32], BlockBody]:
        """
        Downloads and persists the block bodies for the given set of block headers.
        Block bodies are requested from all peers in equal sized batches.
        """
        headers = tuple(header for header in all_headers
                        if not _is_body_empty(header))
        block_bodies_by_key: Dict[Tuple[Hash32, Hash32], BlockBody] = {}

        while headers:
            # split the remaining headers into equal sized batches for each peer.
            peers = cast(Tuple[ETHPeer, ...],
                         self.peer_pool.get_peers(target_td))
            if not peers:
                raise NoEligiblePeers(
                    "No connected peers have the block bodies we need for td={0}"
                    .format(target_td))
            batch_size = math.ceil(len(headers) / len(peers))
            batches = tuple(partition_all(batch_size, headers))

            # issue requests to all of the peers and wait for all of them to respond.
            requests = tuple(
                self._get_block_bodies(peer, batch)
                for peer, batch in zip(peers, batches))
            responses = await self.wait(
                asyncio.gather(
                    *requests,
                    loop=self.get_event_loop(),
                ))

            # extract the returned block body data and the headers for which we
            # are still missing block bodies.
            all_block_body_bundles, all_missing_headers = zip(*responses)

            for (body, (tx_root, trie_data_dict),
                 uncles_hash) in concat(all_block_body_bundles):
                await self.wait(
                    self.db.coro_persist_trie_data_dict(trie_data_dict))

            block_bodies_by_key = merge(
                block_bodies_by_key,
                {(transaction_root, uncles_hash): block_body
                 for block_body, (transaction_root, trie_dict_data),
                 uncles_hash in concat(all_block_body_bundles)})
            headers = tuple(concat(all_missing_headers))

        self.logger.debug("Got block bodies batch for %d headers",
                          len(all_headers))
        return block_bodies_by_key
Example no. 33
File: csv.py Project: Casolt/blaze
    def _extend(self, rows):
        mode = 'ab' if PY2 else 'a'
        newline = dict() if PY2 else dict(newline='')
        dialect = keyfilter(to_csv_kwargs.__contains__, self.dialect)
        should_write_newline = self.last_char() != os.linesep
        with csvopen(self, mode=mode, **newline) as f:
            # we have data in the file, append a newline
            if should_write_newline:
                f.write(os.linesep)

            for df in map(partial(bz.into, pd.DataFrame),
                          partition_all(self.chunksize, iter(rows))):
                df.to_csv(f, index=False, header=None, encoding=self.encoding,
                          **dialect)
Example no. 34
def big_kmeans(docs, k, batch_size=1000, n_features=(2 ** 20),
               single_pass=True):
    """k-means for very large sets of documents.

    See kmeans for documentation. Differs from that function in that it does
    not compute tf-idf or LSA, and fetches the documents in a streaming
    fashion, so they don't need to be held in memory. It does not do random
    restarts.

    If the option single_pass is set to False, the documents are visited
    twice: once to fit a k-means model, once to determine their label in
    this model.
    """
    from sklearn.cluster import MiniBatchKMeans
    from sklearn.feature_extraction.text import HashingVectorizer

    docs = tosequence(docs)

    vectorizer = HashingVectorizer(input="content",
                                   n_features=n_features, norm="l2")
    kmeans = MiniBatchKMeans(n_clusters=k)

    labels = []
    for batch in toolz.partition_all(batch_size, docs):
        batch = map(fetch, batch)
        batch = vectorizer.transform(batch)
        y = kmeans.fit_predict(batch)
        if single_pass:
            labels.extend(y.tolist())

    if not single_pass:
        for batch in toolz.partition_all(batch_size, docs):
            batch = map(fetch, batch)
            batch = vectorizer.transform(batch)
            labels.extend(kmeans.predict(batch).tolist())

    return _group_clusters(docs, labels)
Example no. 35
def batch_train_custom(dataset, input_model=None, output_model=None, lang='en',
                factor=1, dropout=0.2, n_iter=1, batch_size=10,
                eval_id=None, eval_split=None, long_text=False, silent=False,shuffle=False):
    if(n_iter ==1):
        print("one pass mode")
    print("batch_size",batch_size)
    #print(factor,type(factor))
    DB = connect()
    print_ = get_print(silent)
    random.seed(0)
    if input_model is not None:
        nlp = spacy.load(input_model, disable=['ner'])
        print_('\nLoaded model {}'.format(input_model))
        model = TextClassifier(nlp, labels, long_text=long_text,
                               low_data=len(examples) < 1000)
    else:
        print("build your customized model")
        pt_model = FastText(vocab_size=684831, emb_dim = 300)
        optimizer = torch.optim.Adam(pt_model.parameters(), lr=0.001)
        criterion = nn.BCELoss()
        model = Prodigy_model(pt_model,label_size=1,optimizer=optimizer,loss=criterion)
    examples = DB.get_dataset(dataset)
    if eval_id:
        evals = DB.get_dataset(eval_id)
        print_("Loaded {} evaluation examples from '{}'"
               .format(len(evals), eval_id))
    examples = examples[:int(len(examples) * factor)]
    print_(printers.trainconf(dropout, n_iter, batch_size, factor,
                              len(examples)))
    if len(evals) > 0:
        print_(printers.tc_update_header())
    best_acc = {'accuracy': 0}
    best_model = None
    for i in range(n_iter):
        if shuffle:
            random.shuffle(examples)
        batch_idx = 1
        for batch in cytoolz.partition_all(batch_size,
                                           tqdm.tqdm(examples, leave=False)):
            #print(j)
            batch = list(batch)
            loss = model.update(batch)
            if len(evals) > 0 and batch_idx % (4 * batch_size) == 0:
                acc = model.evaluate(evals)     
                #print_(printers.tc_update(batch_idx, loss, acc))
                print('Epoch: [{0}/{1}], Step: [{2}/{3}], Loss: {4}, Validation Acc:{5}'.format( 
                   i+1, n_iter, batch_idx, len(examples)//batch_size, loss, acc))
            batch_idx += 1
    return acc
Example no. 36
def count_sentences(texts_in, batch_size, name):
    t0 = time.clock()
    N = max(1, len(texts_in) // 5)
    n = 0
    k = 0
    for minibatch in cytoolz.partition_all(batch_size, texts_in):
        texts = list(minibatch)
        text_sents = sentence_cache.sent_id_pipe(texts)
        k += len(text_sents)
        n += sum(len(sent) for sent in text_sents)
        if k % N == 0 or k == len(texts):
            dt = max(time.clock() - t0, 1.0)
            print('##^^%5s=%7d sents=%8d dt=%4.1f sec %2.1f sents/doc %3.1f docs/sec %3.1f sents/sec' %
                (name, k, n, dt, n / k, k / dt, n / dt))
    return n
Example no. 37
    def _get_batch(self, bs: int, x, y=None):
        y_x_pairs = zip(y, x) if y is not None else enumerate(x)
        for batch in cytoolz.partition_all(bs, y_x_pairs):
            batch_y, batch_x = more_itertools.unzip(batch)
            X, Y = list(batch_x), list(batch_y)

            if sparse.issparse(Y[0]):
                Y = sparse.vstack(Y)
            elif isinstance(Y[0], np.ndarray):
                Y = np.vstack(Y)
            if sparse.issparse(X[0]):
                X = sparse.vstack(X)
            elif isinstance(X[0], np.ndarray):
                X = np.vstack(X)
            yield X, Y
Example no. 38
    def __iter__(self):
        self.order = []
        user_ids = list(set(self.tweets.values_list(self.key, flat=True)))

        for user_pks in partition_all(self.step, user_ids):
            queryset = self.tweets.filter(user_id__in=user_pks)
            for key_id, tweet_set in groupby(queryset.only('text', self.key).order_by(self.key), key=self.key_func):
                bows = []

                for tweet in tweet_set:
                    keywords = self.characterizer.tokenize(tweet.text)
                    bow = self.dictionary.doc2bow(set(keywords), allow_update=False)
                    bows.append(dict(bow))

                self.order.append(key_id)
                yield list(merge_with(sum, *bows).items())
Example no. 39
    def handle(self, *args, **options):
        step = options.get('step', 10000)

        limit_date = datetime.datetime.now() - datetime.timedelta(days=options.get('days', 7))
        print(limit_date)

        queryset = Tweet.objects.filter(datetime__lt=limit_date)
        while queryset.exists():
            tweet_ids = [t.pk for t in queryset[:step]]
            Tweet.objects.filter(pk__in=tweet_ids).delete()
            print('deleted', len(tweet_ids))

        user_ids = User.objects.annotate(Count('author')).filter(author__count=0).values_list('pk', flat=True)
        for pks in partition_all(step, user_ids):
            User.objects.filter(pk__in=pks).delete()
            print('deleted', len(pks), 'users')
Example no. 40
def concatenate_tweets(tweets, dictionary, characterizer, step=10000):
    if not tweets.count():
        return []

    all_bows = []

    for tweets in partition_all(step, queryset_iterator(tweets.only('text'), chunksize=step)):
        bows = []

        for tweet in tweets:
            keywords = characterizer.tokenize(tweet.text)
            bow = dictionary.doc2bow(set(keywords), allow_update=False)
            bows.append(dict(bow))

        all_bows.append(merge_with(sum, *bows))

    return list(merge_with(sum, all_bows).items())
Example no. 41
File: csv.py Project: pgnepal/blaze
    def _extend(self, rows):
        mode = 'ab' if PY2 else 'a'
        newline = dict() if PY2 else dict(newline='')
        dialect = keyfilter(to_csv_kwargs.__contains__, self.dialect)
        should_write_newline = self.last_char() != os.linesep
        f = self.open(self.path, mode, **newline)

        try:
            # we have data in the file, append a newline
            if should_write_newline:
                f.write(os.linesep)

            for df in map(partial(bz.into, pd.DataFrame),
                          partition_all(self.chunksize, iter(rows))):
                df.to_csv(f, index=False, header=None, **dialect)
        finally:
            try:
                f.close()
            except AttributeError:
                pass
Example no. 42
 def pipe(self, docs, batch_size=1000, n_threads=-1):
     interval = 10
     t0 = time.clock()
     i = 0
     k = 0
     for minibatch in cytoolz.partition_all(batch_size, docs):
         minibatch = list(minibatch)
         for doc in minibatch:
             Xs = get_features(doc.sents, self.max_length)
             ys = self._model.predict(Xs)
             if i >= interval:
                 xprint('SentimentAnalyser.pipe: %4d docs %5d sents %.1f sec' % (i, k, time.clock() - t0))
                 interval *= 2
             for method in self.methods:
                 y = reduce(ys, method=method)
                 assert len(y.shape) == 1 and len(y) == ys.shape[1], (ys.shape, y.shape)
                 doc.user_data[method] = y
             yield doc
             i += 1
             k += ys.shape[0]
     xprint('SentimentAnalyser.pipe: %4d docs %5d sents %.1f sec TOTAL' % (i, k, time.clock() - t0))
Example no. 43
 def partition_all(self, n):
     return self.__class__(self.__class__(p) for p in cytoolz.partition_all(n, self))
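
A minimal sketch of how a wrapper method like the one above might behave; the Seq class here is hypothetical and only stands in for the unknown self.__class__:

import cytoolz

class Seq(tuple):
    def partition_all(self, n):
        # Chunk into groups of at most n, preserving the container type at both levels.
        return self.__class__(self.__class__(p) for p in cytoolz.partition_all(n, self))

s = Seq(range(5))
assert s.partition_all(2) == ((0, 1), (2, 3), (4,))
assert all(isinstance(chunk, Seq) for chunk in s.partition_all(2))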