def big_kmeans(docs, k, batch_size=1000, n_features=(2 ** 20),
               single_pass=True):
    """k-means for very large sets of documents.

    See kmeans for documentation. Differs from that function in that it does
    not compute tf-idf or LSA, and fetches the documents in a streaming
    fashion, so they don't need to be held in memory. It does not do random
    restarts.

    If the option single_pass is set to False, the documents are visited
    twice: once to fit a k-means model, once to determine their label in
    this model.
    """
    from sklearn.cluster import MiniBatchKMeans
    from sklearn.feature_extraction.text import HashingVectorizer

    docs = tosequence(docs)
    v = HashingVectorizer(input="content", n_features=n_features, norm="l2")
    km = MiniBatchKMeans(n_clusters=k)

    labels = []
    for batch in toolz.partition_all(batch_size, docs):
        batch = map(fetch, batch)  # was map(fetch, docs): refetched the whole corpus on every batch
        batch = v.transform(batch)
        y = km.fit_predict(batch)
        if single_pass:
            labels.extend(y.tolist())

    if not single_pass:
        for batch in toolz.partition_all(batch_size, docs):
            batch = map(fetch, batch)  # same fix as above
            batch = v.transform(batch)
            labels.extend(km.predict(batch).tolist())

    return group_clusters(docs, labels)
def test_from_to_iterable(nums):
    nums_pl = nums
    nums_pl = pl.sync.from_iterable(nums_pl)
    nums_pl = cz.partition_all(10, nums_pl)
    nums_pl = pl.sync.map(sum, nums_pl)
    nums_pl = list(nums_pl)

    nums_py = nums
    nums_py = cz.partition_all(10, nums_py)
    nums_py = map(sum, nums_py)
    nums_py = list(nums_py)

    assert nums_py == nums_pl
def test_iterable_and_map(nums):
    nums_pl = nums
    nums_pl = pl.task.from_iterable(nums_pl)
    nums_pl = cz.partition_all(10, nums_pl)
    nums_pl = pl.task.map(sum, nums_pl)
    nums_pl = list(nums_pl)

    nums_py = nums
    nums_py = cz.partition_all(10, nums_py)
    nums_py = map(sum, nums_py)
    nums_py = list(nums_py)

    assert nums_py == nums_pl
async def test_from_to_iterable_2(nums: tp.List[int]):
    nums_pl = nums
    nums_pl = pl.task.from_iterable(nums_pl)
    nums_pl = cz.partition_all(10, nums_pl)
    nums_pl = pl.task.map(sum, nums_pl)
    nums_pl = pl.task.to_async_iterable(nums_pl)
    nums_pl = [x async for x in nums_pl]

    nums_py = nums
    nums_py = cz.partition_all(10, nums_py)
    nums_py = map(sum, nums_py)
    nums_py = list(nums_py)

    assert nums_py == nums_pl
def test_from_to_iterable(nums: tp.List[int]):
    nums_pl = nums
    nums_pl = pl.thread.from_iterable(nums_pl)
    nums_pl = cz.partition_all(10, nums_pl)
    nums_pl = pl.thread.map(sum, nums_pl)
    nums_pl = pl.thread.to_iterable(nums_pl)
    nums_pl = list(nums_pl)

    nums_py = nums
    nums_py = cz.partition_all(10, nums_py)
    nums_py = map(sum, nums_py)
    nums_py = list(nums_py)

    assert nums_py == nums_pl
def test_benchmark_RNN_fwd():
    nO = 128
    nI = 128
    n_batch = 1000
    batch_size = 30
    seq_len = 30
    lengths = numpy.random.normal(scale=10, loc=30, size=n_batch * batch_size)
    lengths = numpy.maximum(lengths, 1)
    batches = []
    uniform_lengths = False
    for batch_lengths in partition_all(batch_size, lengths):
        batch_lengths = list(batch_lengths)
        if uniform_lengths:
            seq_len = max(batch_lengths)
            batch = [
                numpy.asarray(
                    numpy.random.uniform(0., 1., (int(seq_len), nI)),
                    dtype='f')
                for _ in batch_lengths
            ]
        else:
            batch = [
                numpy.asarray(
                    numpy.random.uniform(0., 1., (int(seq_len), nI)),
                    dtype='f')
                for seq_len in batch_lengths
            ]
        batches.append(batch)
    model = LSTM(nO, nI)
    start = timeit.default_timer()
    for Xs in batches:
        ys, bp_ys = model.begin_update(list(Xs))
        # _ = bp_ys(ys)
    end = timeit.default_timer()
    n_samples = n_batch * batch_size
    print("--- %i samples in %s seconds (%f samples/s, %.7f s/sample) ---" % (
        n_samples, end - start, n_samples / (end - start),
        (end - start) / n_samples))
def __iter__(self):
    ids = self._create_ids()
    random.shuffle(ids)
    buckets = [
        sorted(ids[i:i + self._bucket_size], key=self.sort_fn, reverse=True)
        for i in range(0, len(ids), self._bucket_size)
    ]
    # fill batches until max_token (include padding)
    batches = []
    for bucket in buckets:
        max_len = 0
        batch_indices = []
        for indices in partition_all(self._mul, bucket):
            # make sure batch size is a multiple of 8
            ind_max = max(self._keys[i] for i in indices)
            ind_max = max(ind_max)  # max of src/tgt
            max_len = max(max_len, ind_max)
            if max_len * (len(batch_indices) + self._mul) > self._max_tok:
                if not batch_indices:
                    raise ValueError(
                        "max_tokens too small / max_seq_len too long")
                batches.append(batch_indices)
                batch_indices = list(indices)
                max_len = ind_max
            else:
                batch_indices.extend(indices)
        if not self._droplast and batch_indices:
            batches.append(batch_indices)
    random.shuffle(batches)
    return iter(batches)
def __iter__(self):
    ids = self._create_ids()
    random.shuffle(ids)
    buckets = [
        sorted(ids[i:i + self._bucket_size], key=self._sort_fn, reverse=True)
        for i in range(0, len(ids), self._bucket_size)
    ]
    batches = []
    for bucket in buckets:
        max_len = 0
        batch_indices = []
        # Partition the bucket into tuples of length at most self._size_mul
        for indices in partition_all(self._size_mul, bucket):
            max_len = max(max_len, max(self._lens[i] for i in indices))
            # fill batches until max_token (include padding)
            if max_len * (len(batch_indices) + self._size_mul) > self._max_tok:
                if not batch_indices:
                    raise ValueError(
                        "max_tokens too small / max_seq_len too long")
                assert len(batch_indices) % self._size_mul == 0
                batches.append(batch_indices)
                batch_indices = list(indices)
            else:
                batch_indices.extend(indices)
        if not self._droplast and batch_indices:
            batches.append(batch_indices)
    random.shuffle(batches)
    return iter(batches)
def __iter__(self):
    ids = self._create_ids()
    random.shuffle(ids)
    # Split the whole dataset into len(buckets) buckets of size _bucket_size.
    # anwen hu note: in each bucket, instances are ranked according to the
    # entire sequence length (txt + num_bb)
    buckets = [
        sorted(ids[i:i + self._bucket_size], key=self._sort_fn, reverse=True)
        for i in range(0, len(ids), self._bucket_size)
    ]
    # fill batches until max_token (include padding)
    batches = []
    for bucket in buckets:
        max_len = 0
        batch_indices = []
        for indices in partition_all(self._size_mul, bucket):
            max_len = max(max_len, max(self._lens[i] for i in indices))
            if (max_len * (len(batch_indices) + self._size_mul)
                    > self._max_tok):
                if not batch_indices:
                    raise ValueError(
                        "max_tokens too small / max_seq_len too long")
                assert len(batch_indices) % self._size_mul == 0
                batches.append(batch_indices)
                batch_indices = list(indices)
            else:
                batch_indices.extend(indices)
        if not self._droplast and batch_indices:
            batches.append(batch_indices)
    random.shuffle(batches)
    return iter(batches)
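# The three samplers above share one token-budget rule. Below is a
# self-contained sketch of just that rule, under simplified assumptions
# (plain lists instead of dataset ids, no shuffling, and a soft flush
# instead of raising when a single group already exceeds the budget):
# grow a batch in groups of size_mul indices, and flush it once
# max_len * (batch_size + size_mul) would exceed max_tok.
from toolz import partition_all

def token_batches(lens, size_mul=2, max_tok=12):
    batches, batch, max_len = [], [], 0
    for group in partition_all(size_mul, range(len(lens))):
        max_len = max(max_len, max(lens[i] for i in group))
        if max_len * (len(batch) + size_mul) > max_tok and batch:
            batches.append(batch)
            batch, max_len = list(group), max(lens[i] for i in group)
        else:
            batch.extend(group)
    if batch:
        batches.append(batch)
    return batches

print(token_batches([3, 3, 5, 2, 6, 1]))  # [[0, 1], [2, 3], [4, 5]]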
def insert_predictions(cfg, predictions):
    with cluster(cfg) as c:
        s = session(cfg, c)
        st = '''INSERT INTO {keyspace}.prediction
                    (cx, cy, px, py, sday, eday, pday, prob)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?)'''.format(
            keyspace=cfg['cassandra_keyspace'])
        stmt = s.prepare(st)

        chunks = partition_all(cfg['cassandra_batch_size'], predictions)
        batches = []
        for chunk in chunks:
            batch = BatchStatement(batch_type=BatchType.UNLOGGED)
            for p in chunk:  # renamed from `c`, which shadowed the cluster above
                batch.add(stmt, [p['cx'], p['cy'], p['px'], p['py'],
                                 p['sday'], p['eday'], p['pday'], p['prob']])
            batches.append(batch)
        return [s.execute(b) for b in batches]
def async_requests(
    url_payload: List[Tuple[str, Optional[MutableMapping[str, Any]]]],
    read: str,
    request: str = "GET",
    max_workers: int = 8,
) -> List[Union[str, MutableMapping[str, Any], bytes]]:
    """Send async requests.

    This function is based on `this <https://github.com/HydrologicEngineeringCenter/data-retrieval-scripts/blob/master/qpe_async_download.py>`__
    script.

    Parameters
    ----------
    url_payload : list of tuples
        A list of URLs and payloads as a tuple.
    read : str
        The method for returning the request: ``binary``, ``json``, or ``text``.
    request : str, optional
        The request type, GET or POST; defaults to GET.
    max_workers : int, optional
        The maximum number of async processes; defaults to 8.

    Returns
    -------
    list
        A list of responses.
    """
    chunked_urls = tlz.partition_all(max_workers, url_payload)
    results = (
        asyncio.get_event_loop().run_until_complete(
            _async_session(c, read, request))
        for c in chunked_urls
    )
    return list(tlz.concat(results))
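# Hypothetical usage sketch for async_requests above. The endpoint URLs are
# placeholders (example.com), not a real service; a payload of None means a
# plain GET with no query parameters.
urls = [("https://example.com/api/item/{}".format(i), None) for i in range(20)]
responses = async_requests(urls, read="json", max_workers=8)
# len(responses) == 20; the URLs were issued in chunks of 8 concurrent calls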
def __init__(self, path, number_of_columns, rowspaces, page_spaces,
             rows_in_page):
    self._path = path
    self._number_of_columns = number_of_columns
    self._rowspaces = rowspaces
    self._page_spaces = page_spaces
    self._rows_in_page = rows_in_page
    self._cols = range(self._number_of_columns)

    total_width = 90
    width = total_width // self._number_of_columns
    file_list = filter_jpg(path)
    calc = xcoord(number_of_columns=self._number_of_columns, width=width)
    self._left_shifts = list(map(calc, self._cols))

    # partitions the list of files into tuples with len == number_of_columns,
    # so each row will contain 5 files if number_of_columns == 5:
    # [(file1, file2, ..., file5), (file6, ..., file10), ...]
    each_row = cytoolz.partition_all(self._number_of_columns, file_list)

    # each page has `rows_in_page` rows; every row is grouped with another:
    # [(row1, row2), (row3, row4), ...]
    # where row1 == (file1, file2, ...)
    self._pages_list = cytoolz.partition(self._rows_in_page, each_row,
                                         pad=None)
    self._pages_list = list(self._pages_list)

    assert len(self._pages_list[0]) <= len(
        self._rowspaces) == self._rows_in_page
    assert len(self._pages_list) <= len(self._page_spaces)
def make_embeddings(self, load=False, filename=None):
    """Embed all the sentences as ELMo embeddings.

    Args:
        load: if True, load from file using the given filename
        filename: string of filename for saved ELMo embeddings; if None,
            loader defaults to 'elmo_embeddings.npy' to load from
    """
    if load:
        self.load_elmo_embeddings(filename)
        return

    # Get the ELMo model
    url = "https://tfhub.dev/google/elmo/2"
    embed_model = hub.Module(url)
    all_embeddings = []

    # if self.use_gpu:
    #     device = '/gpu:0'
    # else:
    #     device = '/cpu:0'
    # with tf.device(device):
    for sentence_block in cytoolz.partition_all(150, self.sentences):
        embeddings = embed_model(sentence_block, signature="default",
                                 as_dict=True)["default"]
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            sess.run(tf.tables_initializer())
            x = sess.run(embeddings)
        all_embeddings.extend(x)

    self.elmo_embeddings = np.array(all_embeddings)
    self.reduced_embeddings = self.elmo_embeddings
def _drainage_area_sqm(self, siteinfo: pd.DataFrame, freq: str) -> pd.Series:
    """Get drainage area of the stations."""
    area = siteinfo[["site_no", "drain_sqkm"]].copy()
    if area["drain_sqkm"].isna().any():
        sids = area[area["drain_sqkm"].isna()].site_no
        queries = [
            {
                "parameterCd": "00060",
                "siteStatus": "all",
                "outputDataTypeCd": freq,
                "sites": ",".join(s),
            }
            for s in tlz.partition_all(1500, sids)
        ]
        info = self.get_info(queries, expanded=True)

        def get_idx(ids: List[str]) -> Tuple[pd.Index, pd.Index]:
            return info.site_no.isin(ids), area.site_no.isin(ids)

        i_idx, a_idx = get_idx(sids)
        # Drainage areas in info are in sq mi and should be converted to sq km
        area.loc[a_idx, "drain_sqkm"] = info.loc[
            i_idx, "contrib_drain_area_va"] * 0.38610

        if area["drain_sqkm"].isna().any():
            sids = area[area["drain_sqkm"].isna()].site_no
            i_idx, a_idx = get_idx(sids)
            area.loc[a_idx, "drain_sqkm"] = info.loc[
                i_idx, "drain_area_va"] * 0.38610

    if area["drain_sqkm"].isna().all():
        raise DataNotAvailable("drainage")
    return area.set_index("site_no").drain_sqkm * 1e6
def _get_streamflow(self, sids: Sequence[str], start_dt: str, end_dt: str,
                    freq: str, kwargs: Dict[str, str]) -> pd.DataFrame:
    """Convert json to dataframe."""
    payloads = [
        {
            "sites": ",".join(s),
            "startDT": start_dt,
            "endDT": end_dt,
            **kwargs,
        }
        for s in tlz.partition_all(1500, sids)
    ]
    resp = ar.retrieve_json(
        [f"{self.url}/{freq}"] * len(payloads),
        [{"params": p} for p in payloads],
        expire_after=self.expire_after,
        disable=self.disable_caching,
    )

    def get_site_id(site_cd: Dict[str, str]) -> str:
        """Get site id."""
        return f"{site_cd['agencyCode']}-{site_cd['value']}"

    r_ts = {
        get_site_id(t["sourceInfo"]["siteCode"][0]): t["values"][0]["value"]
        for r in resp
        for t in r["value"]["timeSeries"]
        if len(t["values"][0]["value"]) > 0
    }
    if len(r_ts) == 0:
        raise DataNotAvailable("discharge")

    def to_df(col: str, values: Dict[str, Any]) -> pd.DataFrame:
        discharge = pd.DataFrame.from_records(
            values, exclude=["qualifiers"], index=["dateTime"])
        discharge.index = pd.to_datetime(
            discharge.index, infer_datetime_format=True)
        if discharge.index.tz is None:
            tz = resp[0]["value"]["timeSeries"][0]["sourceInfo"]["timeZoneInfo"]
            tz_dict = {
                "CST": "US/Central",
                "MST": "US/Mountain",
                "PST": "US/Pacific",
                "EST": "US/Eastern",
            }
            time_zone = tz_dict.get(
                tz["defaultTimeZone"]["zoneAbbreviation"],
                tz["defaultTimeZone"]["zoneAbbreviation"],
            )
            discharge.index = discharge.index.tz_localize(time_zone)
        discharge.index = discharge.index.tz_convert("UTC")
        discharge.columns = [col]
        return discharge

    qobs = pd.concat([to_df(s, t) for s, t in r_ts.items()], axis=1)
    # Convert cfs to cms
    return qobs.astype("float64") * 0.028316846592
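# Unit check for the cfs -> cms factor used above: 1 ft = 0.3048 m, so
# 1 cubic foot = 0.3048 ** 3 cubic metres.
print(0.3048 ** 3)  # 0.028316846592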
def batch_train_custom_cumulate(dataset, input_model=None, output_model=None,
                                lang='en', factor=1, dropout=0.2, n_iter=1,
                                batch_size=10, eval_id=None, eval_split=None,
                                long_text=False, silent=False, shuffle=False,
                                gpu_id=None):
    if gpu_id == 0 and torch.cuda.is_available():
        print("Using cuda")
        os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
        cudnn.benchmark = True
    if n_iter == 1:
        print("one pass mode")
    print("batch_size", batch_size)
    # print(factor, type(factor))
    DB = connect()
    print_ = get_print(silent)
    random.seed(0)
    examples = DB.get_dataset(dataset)  # moved up: was referenced before assignment below
    if input_model is not None:
        nlp = spacy.load(input_model, disable=['ner'])
        print_('\nLoaded model {}'.format(input_model))
        model = TextClassifier(nlp, labels, long_text=long_text,
                               low_data=len(examples) < 1000)
    else:
        print("build your customized model")
        pt_model = FastText(vocab_size=684831, emb_dim=300).cuda()
        optimizer = torch.optim.Adam(pt_model.parameters(), lr=0.001)
        criterion = nn.BCELoss()
        model = Prodigy_model(pt_model, label_size=1, optimizer=optimizer,
                              loss=criterion)
    evals = []  # default so the len(evals) checks below don't fail without eval_id
    if eval_id:
        evals = DB.get_dataset(eval_id)
        print_("Loaded {} evaluation examples from '{}'"
               .format(len(evals), eval_id))
    examples = examples[:int(len(examples) * factor)]
    print_(printers.trainconf(dropout, n_iter, batch_size, factor,
                              len(examples)))
    if len(evals) > 0:
        print_(printers.tc_update_header())
    best_acc = {'accuracy': 0}
    best_model = None
    start_time = datetime.now()
    if len(evals) > 0:
        model.init_eval(evals)
    interval = 100
    for fac in np.arange(interval, len(examples) + interval, interval):
        examples_fac = examples[:fac]
        batch_number = len(examples_fac) / batch_size  # was examples_fac / batch_size (TypeError)
        for i in range(n_iter):
            if shuffle:
                print("it's shuffling")
                random.shuffle(examples)
            batch_idx = 0
            loss = 0
            for batch in cytoolz.partition_all(
                    batch_size, tqdm.tqdm(examples_fac, leave=False)):
                # was `examples`: the cumulative slice is what this pass trains on
                batch = list(batch)
                loss += model.update(batch)
                batch_idx += 1
            acc = model.evaluate(evals)
            end_time = datetime.now() - start_time  # was undefined
            print_('Time: [{0} seconds], process: [{1}/{2}], '
                   'Epoch: [{3}/{4}], step: [{5}/{6}], '
                   'Loss: {7}, Acc: {8}'.format(
                       end_time.seconds, fac, len(examples), i + 1, n_iter,
                       batch_idx + 1, len(examples_fac) // batch_size,
                       loss / batch_number, acc))
    return acc
def process(reader, writer, tokenizer):
    with mp.Pool() as pool, tqdm(desc='tokenizing') as pbar:
        for lines in partition_all(BUF, reader):
            for tokens in pool.imap(tokenize(tokenizer), lines,
                                    chunksize=CHUNK):
                write(writer, tokens)
            pbar.update(len(lines))
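# Self-contained sketch of the same chunked pool.imap pattern with stand-in
# I/O, since BUF, CHUNK, tokenize and write live elsewhere in that module.
# The sizes and the to_upper worker are illustrative assumptions: read BUF
# items at a time, fan them out over a process pool, collect in input order.
import multiprocessing as mp
from toolz import partition_all

BUF, CHUNK = 10000, 64

def to_upper(line):
    return line.upper()

def run(lines):
    out = []
    with mp.Pool() as pool:
        for chunk in partition_all(BUF, lines):
            out.extend(pool.imap(to_upper, chunk, chunksize=CHUNK))
    return out

if __name__ == '__main__':
    print(run(["a", "b", "c"]))  # ['A', 'B', 'C']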
def pipe(self, docs, batch_size=1000, n_threads=n_threads):
    for minibatch in cytoolz.partition_all(batch_size, docs):
        minibatch = list(minibatch)
        for doc in minibatch:
            Xs = get_char_features(self.char_index, doc.sents,
                                   self.max_length)
            ys = self._model.predict(Xs)
            doc.user_data['toxics'] = ys.mean(axis=0)
            yield doc
def _tokenise(self):
    """Partition the dataset and run the tokeniser in parallel."""
    partitions = partition_all(self._batch_size, self._line_iter())
    executor = Parallel(n_jobs=self._n_jobs)
    tasks = (delayed(subprocess)(NLP, part) for part in partitions)
    return flatten(executor(tasks))
async def request_receipts(self, headers: List[BlockHeader]) -> None:
    for batch in partition_all(eth.MAX_RECEIPTS_FETCH, headers):
        peer = await self.peer_pool.get_random_peer()
        cast(ETHPeer, peer).sub_proto.send_get_receipts(
            [header.hash for header in batch])
        self.logger.debug("Requesting %d block receipts from %s",
                          len(batch), peer)
        now = time.time()
        for header in batch:
            self._pending_receipts[header.receipt_root] = (header, now)
def _inner_preduce(x):
    """Splits the sequence into pairs and possibly one singlet, on each of
    which `fn` is performed to create a new sequence.
    """
    if len(x) < 3:
        return _sfn(x)
    paired_x = partition_all(2, x)
    new_x = tuple(pool.map(_sfn, paired_x))
    return _inner_preduce(new_x)
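# Self-contained sketch of the same pairwise tree reduction, assuming an
# associative fn; `add` and a thread pool stand in for the module's `fn`,
# `_sfn` and `pool`.
from functools import reduce
from multiprocessing.dummy import Pool
from operator import add
from toolz import partition_all

def preduce(fn, x, pool):
    sfn = lambda chunk: reduce(fn, chunk)
    if len(x) < 3:
        return sfn(x)
    # reduce each pair in parallel, then recurse on the halved sequence
    pairs = list(partition_all(2, x))
    return preduce(fn, tuple(pool.map(sfn, pairs)), pool)

with Pool(4) as pool:
    print(preduce(add, tuple(range(10)), pool))  # 45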
def _request_block_parts(
        self, target_td: int, headers: List[BlockHeader],
        request_func: Callable[[ETHPeer, List[BlockHeader]], None]) -> int:
    peers = self.peer_pool.get_peers(target_td)
    if not peers:
        raise NoEligiblePeers()
    length = math.ceil(len(headers) / len(peers))
    batches = list(partition_all(length, headers))
    for peer, batch in zip(peers, batches):
        request_func(cast(ETHPeer, peer), batch)
    return len(batches)
async def request_bodies(self, headers: List[BlockHeader]) -> None:
    for batch in partition_all(eth.MAX_BODIES_FETCH, headers):
        peer = await self.peer_pool.get_random_peer()
        cast(ETHPeer, peer).sub_proto.send_get_block_bodies(
            [header.hash for header in batch])
        self.logger.debug("Requesting %d block bodies from %s",
                          len(batch), peer)
        now = time.time()
        for header in batch:
            key = (header.transaction_root, header.uncles_hash)
            self._pending_bodies[key] = (header, now)
def merge_sentences(docs, n_sents):
    merged = []
    for group in partition_all(n_sents, docs):
        group = list(group)
        first = group.pop(0)
        to_extend = first['paragraphs'][0]['sentences']
        for sent in group:  # was group[1:], which silently dropped one doc per group
            to_extend.extend(sent['paragraphs'][0]['sentences'])
        merged.append(first)
    return merged
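# Minimal sketch of the input shape merge_sentences expects: each doc holds
# one paragraph with a list of sentences, and every n_sents docs collapse
# into one. The integer "sentences" are placeholders.
docs = [{'paragraphs': [{'sentences': [i]}]} for i in range(5)]
merged = merge_sentences(docs, n_sents=2)
print([d['paragraphs'][0]['sentences'] for d in merged])
# [[0, 1], [2, 3], [4]]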
async def _download_receipts(self,
                             target_td: int,
                             all_headers: Tuple[BlockHeader, ...]) -> None:
    """
    Downloads and persists the receipts for the given set of block headers.

    Receipts are requested from all peers in equal sized batches.
    """
    # Post-Byzantium blocks may have identical receipt roots (e.g. when they
    # have the same number of transactions and all succeed/failed: ropsten
    # blocks 2503212 and 2503284), so we do this to avoid requesting the same
    # receipts multiple times.
    headers = tuple(unique(
        (header for header in all_headers if not _is_receipts_empty(header)),
        key=operator.attrgetter('receipt_root'),
    ))

    while headers:
        # split the remaining headers into equal sized batches for each peer.
        peers = cast(Tuple[ETHPeer, ...], self.peer_pool.get_peers(target_td))
        if not peers:
            raise NoEligiblePeers(
                "No connected peers have the receipts we need for td={0}".format(target_td)
            )
        batch_size = math.ceil(len(headers) / len(peers))
        batches = tuple(partition_all(batch_size, headers))

        # issue requests to all of the peers and wait for all of them to respond.
        requests = tuple(
            self._get_receipts(peer, batch)
            for peer, batch in zip(peers, batches)
        )
        responses = await self.wait(asyncio.gather(
            *requests,
            loop=self.get_event_loop(),
        ))

        # extract the returned receipt data and the headers for which we
        # are still missing receipts.
        all_receipt_bundles, all_missing_headers = zip(*responses)
        receipt_bundles = tuple(concat(all_receipt_bundles))
        headers = tuple(concat(all_missing_headers))

        if len(receipt_bundles) == 0:
            continue

        # process all of the returned receipts, storing their trie data
        # dicts in the database
        receipts, trie_roots_and_data_dicts = zip(*receipt_bundles)
        trie_roots, trie_data_dicts = zip(*trie_roots_and_data_dicts)
        for trie_data in trie_data_dicts:
            await self.wait(self.db.coro_persist_trie_data_dict(trie_data))

        self.logger.debug("Got receipts batch for %d headers", len(all_headers))
def pipe(self, docs, batch_size=1000):
    for minibatch in cytoolz.partition_all(batch_size, docs):
        minibatch = list(minibatch)
        sentences = []
        for doc in minibatch:
            sentences.extend(doc.sents)
        Xs = get_features(sentences, self.max_length)
        ys = self._model.predict(Xs)
        for sent, label in zip(sentences, ys):
            sent.doc.sentiment += label - 0.5
        for doc in minibatch:
            yield doc
def pipe(self, docs, batch_size=1000, n_threads=2):
    for minibatch in cytoolz.partition_all(batch_size, docs):
        minibatch = list(minibatch)
        sentences = []
        for doc in minibatch:
            sentences.extend(doc.sents)
        Xs = get_features(sentences, self.max_length)
        ys = self._model.predict(Xs)
        for sent, label in zip(sentences, ys):
            sent.doc.sentiment += label - 0.5
        for doc in minibatch:
            yield doc
def pipe(self, docs, batch_size=1000, n_threads=2):
    for minibatch in cytoolz.partition_all(batch_size, docs):
        minibatch = list(minibatch)
        sentences = []
        for doc in minibatch:
            sentences.extend(doc.sents)
        Xs = get_features(sentences, self.max_length)
        ys = self._model.predict(Xs)
        for sent, label in zip(sentences, ys):
            sent.doc.user_data['output_labels'] = numpy.where(
                label == numpy.amax(label), 1, 0)
        for doc in minibatch:
            yield doc
def find_number_map(x, y):
    if not (x >= 1 and y >= 1):
        return False
    # 5 = number of cols; 6 = number of rows; 30 images
    number_map = list(cytoolz.partition_all(5, range(30)))
    try:
        # coordinates are a 1-based index
        number = number_map[y - 1][x - 1]
    except IndexError:
        print('Invalid number!\n')
        return False
    return number
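# Worked examples for the 1-based grid lookup above (5 columns x 6 rows):
assert find_number_map(1, 1) == 0      # top-left
assert find_number_map(5, 1) == 4      # end of the first row
assert find_number_map(1, 2) == 5      # start of the second row
assert find_number_map(5, 6) == 29     # bottom-right
assert find_number_map(0, 1) is False  # coordinates must be >= 1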
async def _download_block_bodies(
        self,
        target_td: int,
        all_headers: Tuple[BlockHeader, ...]
) -> Dict[Tuple[Hash32, Hash32], BlockBody]:
    """
    Downloads and persists the block bodies for the given set of block headers.

    Block bodies are requested from all peers in equal sized batches.
    """
    headers = tuple(header for header in all_headers if not _is_body_empty(header))
    block_bodies_by_key: Dict[Tuple[Hash32, Hash32], BlockBody] = {}
    while headers:
        # split the remaining headers into equal sized batches for each peer.
        peers = cast(Tuple[ETHPeer, ...], self.peer_pool.get_peers(target_td))
        if not peers:
            raise NoEligiblePeers(
                "No connected peers have the block bodies we need for td={0}"
                .format(target_td))
        batch_size = math.ceil(len(headers) / len(peers))
        batches = tuple(partition_all(batch_size, headers))

        # issue requests to all of the peers and wait for all of them to respond.
        requests = tuple(
            self._get_block_bodies(peer, batch)
            for peer, batch in zip(peers, batches))
        responses = await self.wait(asyncio.gather(
            *requests,
            loop=self.get_event_loop(),
        ))

        # extract the returned block body data and the headers for which we
        # are still missing block bodies.
        all_block_body_bundles, all_missing_headers = zip(*responses)

        for (body, (tx_root, trie_data_dict), uncles_hash) in concat(all_block_body_bundles):
            await self.wait(self.db.coro_persist_trie_data_dict(trie_data_dict))

        block_bodies_by_key = merge(
            block_bodies_by_key,
            {(transaction_root, uncles_hash): block_body
             for block_body, (transaction_root, trie_dict_data), uncles_hash
             in concat(all_block_body_bundles)})
        headers = tuple(concat(all_missing_headers))

        self.logger.debug("Got block bodies batch for %d headers", len(all_headers))

    return block_bodies_by_key
def _extend(self, rows):
    mode = 'ab' if PY2 else 'a'
    newline = dict() if PY2 else dict(newline='')
    dialect = keyfilter(to_csv_kwargs.__contains__, self.dialect)
    should_write_newline = self.last_char() != os.linesep
    with csvopen(self, mode=mode, **newline) as f:
        # we have data in the file, append a newline
        if should_write_newline:
            f.write(os.linesep)

        for df in map(partial(bz.into, pd.DataFrame),
                      partition_all(self.chunksize, iter(rows))):
            df.to_csv(f, index=False, header=None, encoding=self.encoding,
                      **dialect)
def big_kmeans(docs, k, batch_size=1000, n_features=(2 ** 20),
               single_pass=True):
    """k-means for very large sets of documents.

    See kmeans for documentation. Differs from that function in that it does
    not compute tf-idf or LSA, and fetches the documents in a streaming
    fashion, so they don't need to be held in memory. It does not do random
    restarts.

    If the option single_pass is set to False, the documents are visited
    twice: once to fit a k-means model, once to determine their label in
    this model.
    """
    from sklearn.cluster import MiniBatchKMeans
    from sklearn.feature_extraction.text import HashingVectorizer

    docs = tosequence(docs)
    vectorizer = HashingVectorizer(input="content", n_features=n_features,
                                   norm="l2")
    kmeans = MiniBatchKMeans(n_clusters=k)

    labels = []
    for batch in toolz.partition_all(batch_size, docs):
        batch = map(fetch, batch)
        batch = vectorizer.transform(batch)
        y = kmeans.fit_predict(batch)
        if single_pass:
            labels.extend(y.tolist())

    if not single_pass:
        for batch in toolz.partition_all(batch_size, docs):
            batch = map(fetch, batch)
            batch = vectorizer.transform(batch)
            labels.extend(kmeans.predict(batch).tolist())

    return _group_clusters(docs, labels)
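# Hedged usage sketch for big_kmeans, assuming the module-level fetch()
# passes plain strings through unchanged and that tosequence, toolz and
# _group_clusters are in scope as above. k and batch_size are arbitrary.
corpus = ["one document", "another document", "more text", "final text"]
clusters = big_kmeans(corpus, k=2, batch_size=2, single_pass=False)
for i, members in enumerate(clusters):
    print(i, members)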
def batch_train_custom(dataset, input_model=None, output_model=None,
                       lang='en', factor=1, dropout=0.2, n_iter=1,
                       batch_size=10, eval_id=None, eval_split=None,
                       long_text=False, silent=False, shuffle=False):
    if n_iter == 1:
        print("one pass mode")
    print("batch_size", batch_size)
    # print(factor, type(factor))
    DB = connect()
    print_ = get_print(silent)
    random.seed(0)
    examples = DB.get_dataset(dataset)  # moved up: was referenced before assignment below
    if input_model is not None:
        nlp = spacy.load(input_model, disable=['ner'])
        print_('\nLoaded model {}'.format(input_model))
        model = TextClassifier(nlp, labels, long_text=long_text,
                               low_data=len(examples) < 1000)
    else:
        print("build your customized model")
        pt_model = FastText(vocab_size=684831, emb_dim=300)
        optimizer = torch.optim.Adam(pt_model.parameters(), lr=0.001)
        criterion = nn.BCELoss()
        model = Prodigy_model(pt_model, label_size=1, optimizer=optimizer,
                              loss=criterion)
    evals = []  # default so the len(evals) checks below don't fail without eval_id
    if eval_id:
        evals = DB.get_dataset(eval_id)
        print_("Loaded {} evaluation examples from '{}'"
               .format(len(evals), eval_id))
    examples = examples[:int(len(examples) * factor)]
    print_(printers.trainconf(dropout, n_iter, batch_size, factor,
                              len(examples)))
    if len(evals) > 0:
        print_(printers.tc_update_header())
    best_acc = {'accuracy': 0}
    best_model = None
    for i in range(n_iter):
        if shuffle:
            random.shuffle(examples)
        batch_idx = 1
        for batch in cytoolz.partition_all(
                batch_size, tqdm.tqdm(examples, leave=False)):
            batch = list(batch)
            loss = model.update(batch)
            if len(evals) > 0 and batch_idx % (4 * batch_size) == 0:
                acc = model.evaluate(evals)
                # print_(printers.tc_update(batch_idx, loss, acc))
                print('Epoch: [{0}/{1}], Step: [{2}/{3}], Loss: {4}, '
                      'Validation Acc: {5}'.format(
                          i + 1, n_iter, batch_idx,
                          len(examples) // batch_size, loss, acc))
            batch_idx += 1
    return acc
def count_sentences(texts_in, batch_size, name):
    t0 = time.perf_counter()  # time.clock() was removed in Python 3.8
    N = max(1, len(texts_in) // 5)  # integer step so the k % N check can fire
    n = 0
    k = 0
    for minibatch in cytoolz.partition_all(batch_size, texts_in):
        texts = list(minibatch)
        text_sents = sentence_cache.sent_id_pipe(texts)
        k += len(text_sents)
        n += sum(len(sent) for sent in text_sents)
        if k % N == 0 or k == len(texts):
            dt = max(time.perf_counter() - t0, 1.0)
            print('##^^%5s=%7d sents=%8d dt=%4.1f sec %2.1f sents/doc '
                  '%3.1f docs/sec %3.1f sents/sec' % (
                      name, k, n, dt, n / k, k / dt, n / dt))
    return n
def _get_batch(self, bs: int, x, y=None):
    y_x_pairs = zip(y, x) if y is not None else enumerate(x)
    for batch in cytoolz.partition_all(bs, y_x_pairs):
        batch_y, batch_x = more_itertools.unzip(batch)
        X, Y = list(batch_x), list(batch_y)
        if sparse.issparse(Y[0]):
            Y = sparse.vstack(Y)
        elif isinstance(Y[0], np.ndarray):
            Y = np.vstack(Y)
        if sparse.issparse(X[0]):
            X = sparse.vstack(X)
        elif isinstance(X[0], np.ndarray):
            X = np.vstack(X)
        yield X, Y
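# Runnable sketch of the batching contract with dense numpy inputs, assuming
# _get_batch above is in scope as a module-level function: features and
# labels are re-paired, chunked into groups of bs, and stacked per group.
import numpy as np

class _Host:
    get_batch = _get_batch  # borrow the generator above as a method

x = [np.full((1, 3), i) for i in range(5)]
y = [np.array([[i]]) for i in range(5)]
for X, Y in _Host().get_batch(2, x, y):
    print(X.shape, Y.shape)  # (2, 3) (2, 1), (2, 3) (2, 1), (1, 3) (1, 1)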
def __iter__(self):
    self.order = []
    user_ids = list(set(self.tweets.values_list(self.key, flat=True)))

    for user_pks in partition_all(self.step, user_ids):
        queryset = self.tweets.filter(user_id__in=user_pks)

        for key_id, tweet_set in groupby(
                queryset.only('text', self.key).order_by(self.key),
                key=self.key_func):
            bows = []
            for tweet in tweet_set:
                keywords = self.characterizer.tokenize(tweet.text)
                bow = self.dictionary.doc2bow(set(keywords),
                                              allow_update=False)
                bows.append(dict(bow))

            self.order.append(key_id)
            yield list(merge_with(sum, *bows).items())
def handle(self, *args, **options):
    step = options.get('step', 10000)
    limit_date = (datetime.datetime.now()
                  - datetime.timedelta(days=options.get('days', 7)))
    print(limit_date)

    queryset = Tweet.objects.filter(datetime__lt=limit_date)
    while queryset.exists():
        tweet_ids = [t.pk for t in queryset[:step]]
        Tweet.objects.filter(pk__in=tweet_ids).delete()
        print('deleted', len(tweet_ids))

    user_ids = (User.objects.annotate(Count('author'))
                .filter(author__count=0).values_list('pk', flat=True))
    for pks in partition_all(step, user_ids):
        User.objects.filter(pk__in=pks).delete()
        print('deleted', len(pks), 'users')
def concatenate_tweets(tweets, dictionary, characterizer, step=10000):
    if not tweets.count():
        return []

    all_bows = []
    for tweets in partition_all(step, queryset_iterator(tweets.only('text'),
                                                        chunksize=step)):
        bows = []
        for tweet in tweets:
            keywords = characterizer.tokenize(tweet.text)
            bow = dictionary.doc2bow(set(keywords), allow_update=False)
            bows.append(dict(bow))
        all_bows.append(merge_with(sum, *bows))

    return list(merge_with(sum, all_bows).items())
def _extend(self, rows):
    mode = 'ab' if PY2 else 'a'
    newline = dict() if PY2 else dict(newline='')
    dialect = keyfilter(to_csv_kwargs.__contains__, self.dialect)
    should_write_newline = self.last_char() != os.linesep
    f = self.open(self.path, mode, **newline)

    try:
        # we have data in the file, append a newline
        if should_write_newline:
            f.write(os.linesep)

        for df in map(partial(bz.into, pd.DataFrame),
                      partition_all(self.chunksize, iter(rows))):
            df.to_csv(f, index=False, header=None, **dialect)
    finally:
        try:
            f.close()
        except AttributeError:
            pass
def pipe(self, docs, batch_size=1000, n_threads=-1):
    interval = 10
    t0 = time.perf_counter()  # time.clock() was removed in Python 3.8
    i = 0
    k = 0
    for minibatch in cytoolz.partition_all(batch_size, docs):
        minibatch = list(minibatch)
        for doc in minibatch:
            Xs = get_features(doc.sents, self.max_length)
            ys = self._model.predict(Xs)
            if i >= interval:
                xprint('SentimentAnalyser.pipe: %4d docs %5d sents %.1f sec' % (
                    i, k, time.perf_counter() - t0))
                interval *= 2
            for method in self.methods:
                y = reduce(ys, method=method)
                assert len(y.shape) == 1 and len(y) == ys.shape[1], (
                    ys.shape, y.shape)
                doc.user_data[method] = y
            yield doc
            i += 1
            k += ys.shape[0]
    xprint('SentimentAnalyser.pipe: %4d docs %5d sents %.1f sec TOTAL' % (
        i, k, time.perf_counter() - t0))
def partition_all(self, n):
    return self.__class__(
        self.__class__(p) for p in cytoolz.partition_all(n, self))
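# The snippets above all rely on the same primitive, so a quick reminder of
# its contract: partition_all(n, seq) yields n-sized tuples, and the final
# tuple may be shorter when len(seq) is not a multiple of n (unlike
# toolz.partition, which drops or pads the remainder).
from toolz import partition_all
print(list(partition_all(3, range(8))))
# [(0, 1, 2), (3, 4, 5), (6, 7)]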