def test_buffered_writer_wrapper_works(self):
    """
    Check that a smart_open GCS writer can sit underneath ``io.BufferedWriter``.

    ``BufferedWriter`` passes a memoryview object to the underlying stream in
    python >= 2.7. The text is written through the wrapper, read back through
    ``smart_open.open`` and compared with what was written.
    """
    expected = u'не думай о секундах свысока'

    with smart_open.gcs.Writer(BUCKET_NAME, WRITE_BLOB_NAME) as fout, \
            io.BufferedWriter(fout) as buffered:
        buffered.write(expected.encode('utf-8'))

    blob_url = "gs://{}/{}".format(BUCKET_NAME, WRITE_BLOB_NAME)
    with smart_open.open(blob_url, 'rb') as fin:
        with io.TextIOWrapper(fin, encoding='utf-8') as text:
            actual = text.read()

    self.assertEqual(expected, actual)
def _publish_last_updated(self):
    """Write the timestamp at which the dataset's files were last modified to GCS.

    The timestamp is formatted as ``%Y-%m-%d %H:%M:%S``, serialised as a JSON
    string and written to the ``last_updated`` object of the table version.
    The object's content type is then patched to ``application/json``.
    """
    blob_name = (f"api/{self.api_version}/tables/{self.dataset}/"
                 f"{self.table}/{self.version}/last_updated")
    destination = f"gs://{self.target_bucket}/{blob_name}"
    logging.info(f"Write last_updated to {destination}")

    with smart_open.open(destination, "w") as fout:
        timestamp = self.last_updated.strftime("%Y-%m-%d %H:%M:%S")
        fout.write(json.dumps(timestamp))

    # set Content-Type to json so that timestamp is displayed in the browser
    bucket = self.storage_client.get_bucket(self.target_bucket)
    blob = bucket.get_blob(blob_name)
    blob.content_type = "application/json"
    blob.patch()
def transit_groupby_csv(session, params, s3_filename, key, agg, transport_params, chunk_size=1e6):
    """
    Streaming group-by over a large CSV on S3, to reduce memory usage.

    Based on
    https://maxhalford.github.io/blog/streaming-groupbys-in-pandas-for-big-datasets/

    The data needs to be SORTED by ``key`` and is processed in batches of
    ``chunk_size`` rows. A batch boundary can fall in the middle of a key
    group; the rows of the last key in each batch ("orphans") are therefore
    held back and prepended to the next batch. The orphans remaining after
    the final batch are aggregated as well, so the last key group is not
    lost. In our case data is grouped by lrimoshipno and sorted by
    movementdatetime.

    :param session: boto3 session to access athena data (not used by this function)
    :param params: boto3 parameter dictionary; ``bucket`` and ``path`` are read
    :param s3_filename: file to process after running get_data
    :param key: data has to be sorted by this key
    :param agg: data processing function, called once per batch of complete groups
    :param transport_params: forwarded to ``open`` for the S3 stream
    :param chunk_size: number of rows per batch for processing
    :return: dataframe with potential port coordinates (concatenation of all
        ``agg`` results)
    """
    source_url = 's3://' + params['bucket'] + '/' + params['path'] + '/' + s3_filename

    results = []
    orphans = pd.DataFrame()

    # Keep the stream open only while chunks are being consumed.
    with open(source_url, transport_params=transport_params) as csv_file:
        chunks = pd.read_csv(csv_file,
                             chunksize=chunk_size,
                             parse_dates=['movementdatetime'])

        for chunk in tqdm(chunks):
            # Add the previous orphans to the chunk
            chunk = pd.concat((orphans, chunk))

            # Determine which rows are orphans: the last key may continue
            # in the next chunk, so it cannot be aggregated yet.
            last_val = chunk[key].iloc[-1]
            is_orphan = chunk[key] == last_val

            # Put the new orphans aside
            chunk, orphans = chunk[~is_orphan], chunk[is_orphan]

            # Perform the aggregation and store the results
            results.append(agg(chunk))

    # The rows held back from the final chunk form the last complete group.
    if len(orphans):
        results.append(agg(orphans))

    return pd.concat(results)
def run(self):
    """Starts extracting data from the source files

    Iterates over every source file, reads its raw content and hands it to
    each extraction task returned by ``self._get_extraction_tasks``. Sources
    whose path starts with ``s3://`` are opened with the scraper's S3
    transport parameters. A failure while extracting one source is logged
    and does not stop the remaining sources.

    If all the sources need to be passed in and extracted at the same time,
    then the user may override this method to do so.
    """
    logger.info("Start Extract",
                extra={'task': self.task,
                       **self.scraper.log_extras(),
                       'time_started': self.time_extracted,
                       })

    for source_idx, source_file in enumerate(self._get_sources()):
        if source_file.startswith('s3://'):
            transport_params = _get_s3_params(self.scraper,
                                              context_type='downloader')
        else:
            transport_params = {}

        with open(source_file, 'r', transport_params=transport_params) as f:
            raw_source = f.read()

        try:
            extraction_tasks = self._get_extraction_tasks(raw_source, source_idx)
            # A falsy task collection means there is nothing to do for this source.
            for extraction_task in extraction_tasks or ():
                extraction_task(raw_source)
        except Exception as e:
            logger.exception(f"Extraction Failed: {e}",
                             extra={'task': self.task,
                                    'source_file': source_file,
                                    **self.scraper.log_extras(),
                                    **get_root_exc_log_overides(),
                                    })

    finished_at = datetime.datetime.utcnow().isoformat() + 'Z'
    logger.debug('Extract finished',
                 extra={'task': self.task,
                        **self.scraper.log_extras(),
                        'time_finished': finished_at,
                        })
def test_gcs_performance_small_reads(benchmark):
    """Benchmark reading one MiB of small length-prefixed messages from GCS.

    The payload is a 16-byte message (a ``0x0f`` prefix byte followed by 15
    bytes) repeated until one MiB is filled. It is uploaded once, then read
    back through ``read_length_prefixed_messages`` under the benchmark
    fixture and compared with what was written.
    """
    initialize_bucket()

    ONE_MIB = 1024**2
    msg = b'\x0f' + b'0123456789abcde'  # a length-prefixed "message"
    repeats = len(range(0, ONE_MIB, len(msg)))
    one_megabyte_of_msgs = msg * repeats

    key = _GCS_URL + '/many_reads_performance.bin'
    with smart_open.open(key, 'wb') as fout:
        fout.write(one_megabyte_of_msgs)

    actual = benchmark(read_length_prefixed_messages, key, 'rb', buffering=ONE_MIB)
    assert actual == one_megabyte_of_msgs
async def _fetch_data_from_s3(bucket, key, context):
    """
    Stream a log file from S3 and forward its lines in batches.

    Lines are accumulated until their combined ``sys.getsizeof`` exceeds
    ``MAX_BATCH_SIZE * BATCH_SIZE_FACTOR``; each full batch becomes a request
    built by ``create_log_payload_request``. Pending requests are awaited
    together once ``REQUEST_BATCH_SIZE`` of them have been collected, and
    whatever is left (including the final, possibly partial, batch) is sent
    after the file has been read.

    Files larger than ``MAX_FILE_SIZE`` are rejected with an error log and
    nothing is sent.
    """
    s3_object = boto3.resource('s3').Bucket(bucket).Object(key)
    if s3_object.content_length > MAX_FILE_SIZE:
        logger.error(
            "The log file uploaded to S3 is larger than the supported max size of 400MB")
        return

    metadata = {
        "invoked_function_arn": context.invoked_function_arn,
        "s3_bucket_name": bucket
    }
    log_file_url = "s3://{}/{}".format(bucket, key)

    async with aiohttp.ClientSession() as session:
        pending_lines = []
        pending_requests = []
        batch_counter = 1
        pending_size = 0
        start = time.time()

        with open(log_file_url, encoding='utf-8') as log_lines:
            for index, log in enumerate(log_lines):
                pending_size += sys.getsizeof(log)
                if index % 500 == 0:
                    logger.debug(f"index: {index}")
                pending_lines.append(log)

                if pending_size <= (MAX_BATCH_SIZE * BATCH_SIZE_FACTOR):
                    continue

                # The batch is full: turn it into a request and start a new one.
                logger.debug(f"sending batch: {batch_counter}")
                payload = {"context": metadata, "entry": pending_lines}
                pending_requests.append(
                    create_log_payload_request(payload, session))
                if len(pending_requests) >= REQUEST_BATCH_SIZE:
                    await asyncio.gather(*pending_requests)
                    pending_requests = []
                pending_lines = []
                pending_size = 0
                batch_counter += 1

        # Whatever did not fill a whole batch is sent as the last request.
        payload = {"context": metadata, "entry": pending_lines}
        pending_requests.append(create_log_payload_request(payload, session))
        logger.info("Sending data to NR logs.....")
        await asyncio.gather(*pending_requests)
        elapsed = time.time() - start
        logger.debug(f"time elapsed to send to NR Logs: {elapsed}")
def main():
    """Tabulate read lengths of the fastq files of hca-util submissions.

    For every uuid listed in the ``--uuids`` file, the ``.fastq.gz`` objects
    in the ``hca-util-upload-area`` bucket whose key contains that uuid are
    opened, the first ``--num_reads`` reads of each are parsed, and a table
    of read length -> number of reads per file is written to
    ``<uuid>_read_lengths.txt`` (tab separated).
    """
    import itertools
    from collections import Counter

    parser = argparse.ArgumentParser()
    parser.add_argument('--uuids', help='.txt file with hca-util submission uuids (uuid only). 1 uuid per line')
    # type=int so that a value given on the command line compares as a
    # number; without it argparse hands over a string.
    parser.add_argument('--num_reads', default=1000, type=int, help='number of reads to test')
    args = parser.parse_args()

    bucket_name = 'hca-util-upload-area'
    uuids = pd.read_csv(args.uuids, header=None)
    uuids = list(uuids[0])

    # The bucket listing does not depend on the uuid, so fetch it once.
    s3 = boto3.resource('s3')
    my_bucket = s3.Bucket(bucket_name)
    keys = ['s3://hca-util-upload-area/' + str(s3_object.key)
            for s3_object in my_bucket.objects.all()]

    # A negative --num_reads reads nothing, as before.
    max_reads = max(args.num_reads, 0)

    for uuid in uuids:
        filenames = [key for key in keys if uuid in key]
        # NOTE(review): the first matching key is skipped - presumably the
        # upload-area "directory" entry itself; confirm.
        filenames = filenames[1:]
        filenames = [file for file in filenames if '.fastq.gz' in file]

        my_dict = {}
        for filename in filenames:
            with smart_open.open(filename) as f:
                records = SeqIO.parse(f, 'fastq')
                len_seqs = [len(str(record.seq))
                            for record in itertools.islice(records, max_reads)]
            counts = Counter(len_seqs)
            # Iterate the set so the key order matches the previous output.
            my_dict[filename] = {uniq: counts[uniq] for uniq in set(len_seqs)}

        data = pd.DataFrame.from_dict(my_dict, orient='index')
        out_file = uuid + "_read_lengths.txt"
        data.to_csv(out_file, sep="\t")
        print("Done processing uuid: %s" % (uuid))
def __init__(self, filename, validate_file=False, limit=None):
    """
    Build the token <-> id mappings from a vocabulary file.

    filename: the vocabulary file. It is a flat text file with one
        (normalized) token per line. In addition, the file should also
        contain the special tokens <S>, </S>, <UNK> (case sensitive).
        Can be None, in which case only the three special tokens are used.
    validate_file: if True, raise ValueError when any of <S>, </S>, <UNK>
        was not found.
    limit: process only the first <limit> words from the file; can be
        useful at inference (we assume the vocabulary is sorted by
        frequency).

    Lines equal to '!!!MAXTERMID' are skipped and do not consume an id.
    """
    self._id_to_word = []
    self._word_to_id = {}
    self._unk = -1
    self._bos = -1
    self._eos = -1

    if filename:
        vocab_file = open(filename, 'r')  # Loading vocabulary from file
        vocab_source = vocab_file
    else:
        logging.info(
            "No vocabulary file provided; using special tokens only.")
        vocab_file = None
        # Creating a toy vocabulary ourselves
        vocab_source = ["<S>", "</S>", "<UNK>"]

    try:
        idx = 0
        for line in vocab_source:
            word_name = line.strip()
            if word_name == '<S>':
                self._bos = idx
            elif word_name == '</S>':
                self._eos = idx
            elif word_name == '<UNK>':
                self._unk = idx
            if word_name == '!!!MAXTERMID':
                continue

            self._id_to_word.append(word_name)
            self._word_to_id[word_name] = idx
            idx += 1
            if idx == limit:
                break
    finally:
        # Close the file even when stopping early at `limit` or on error.
        if vocab_file is not None:
            vocab_file.close()

    logging.info(
        f"We will cache the vocabulary of {len(self._id_to_word)} tokens.")

    # check to ensure file has special tokens
    if validate_file:
        if self._bos == -1 or self._eos == -1 or self._unk == -1:
            raise ValueError("Ensure the vocabulary file has "
                             "<S>, </S>, <UNK> tokens")
def upload_from_url(url, s3_key, on_stream_opened=None):
    """Stream the body of ``url`` into the S3 object ``s3_key``.

    url: source URL, fetched with a streaming GET.
    s3_key: destination key inside the configured ``LOCH_S3_BUCKET``.
    on_stream_opened: optional callback, invoked with the response headers
        once a 200 response has been received.

    Returns the ``head_object`` response for the uploaded key when it is
    truthy, otherwise None. Raises ConnectionError on a non-200 response;
    ClientError, ConnectionError and ValueError raised during the upload are
    logged and re-raised.
    """
    bucket = app.config['LOCH_S3_BUCKET']
    s3_url = build_s3_url(s3_key)

    with requests.get(url, stream=True) as response:
        status = response.status_code
        if status != 200:
            app.logger.error(
                'Received unexpected status code, aborting S3 upload '
                f'(status={response.status_code}, body={response.text}, key={s3_key} url={url})'
            )
            raise ConnectionError(
                f'Response {response.status_code}: {response.text}')

        if on_stream_opened:
            on_stream_opened(response.headers)

        try:
            s3_upload_args = {
                'ServerSideEncryption': app.config['LOCH_S3_ENCRYPTION']
            }
            if s3_url.endswith('.gz'):
                s3_upload_args['ContentEncoding'] = 'gzip'
                s3_upload_args['ContentType'] = 'text/plain'

            session = get_session()
            transport_params = dict(
                session=session,
                multipart_upload_kwargs=s3_upload_args,
            )
            # smart_open needs to be told to ignore the .gz extension, or it
            # will smartly attempt to double-compress it.
            with smart_open.open(
                    s3_url,
                    'wb',
                    ignore_ext=True,
                    transport_params=transport_params,
            ) as s3_out:
                for chunk in response.iter_content(chunk_size=1024):
                    s3_out.write(chunk)
        except (ClientError, ConnectionError, ValueError) as e:
            app.logger.error(
                f'Error on S3 upload: source_url={url}, bucket={bucket}, key={s3_key}, error={e}'
            )
            raise e

    s3_response = get_client().head_object(Bucket=bucket, Key=s3_key)
    if not s3_response:
        return None
    app.logger.info(
        f'S3 upload complete: source_url={url}, bucket={bucket}, key={s3_key}'
    )
    return s3_response
def find_secrets():
    """Count keyword hits in every filing of the master list and upload the results.

    Each filing listed in ``master_file_list_subset_clean.csv`` is read from
    the ``sec-filings-v2`` bucket, preprocessed, tokenized and passed to
    ``get_count``. One result row per filing is collected. Every 100 filings
    a sorted checkpoint CSV is uploaded and the previous checkpoint is
    deleted; at the end the complete table is uploaded as
    ``keywords_final.csv`` and ``stop_ec2`` is called.

    Rows are kept in a plain list and turned into a DataFrame only when a
    CSV is written: ``DataFrame.append`` no longer exists in pandas 2.x and
    copied the whole frame on every call.
    """
    base_columns = [
        'cik', 'date', 'company_name', 'form_type', 'filename', 'total',
        'has_secret'
    ] + secret_list + ['has_protection'] + protect_list
    rows = []

    def _sorted_frame():
        # Known columns first, then any extra keys from get_count, mirroring
        # the column order that appending to the empty frame produced.
        frame = pd.DataFrame(rows)
        extra = [col for col in frame.columns if col not in base_columns]
        frame = frame.reindex(columns=base_columns + extra)
        return frame.sort_values(by=['cik', 'date'])

    client = connect_s3()

    # iterate through files
    df_mf = pd.read_csv('master_file_list_subset_clean.csv')
    for progress, i in enumerate(tqdm(df_mf.index)):
        f = df_mf.at[i, 'Filename'].replace('edgar/data/', '')
        with open(f's3://sec-filings-v2/{f}',
                  transport_params={'client': client}) as f_in:
            contents = f_in.read()

        sec_header, ten_k_body = text_preprocessing(contents)
        # NOTE(review): the fiscal year and SIC code are extracted but not
        # stored anywhere; the calls are kept in case they validate the header.
        extract_fyear(sec_header)
        extract_sic(sec_header)
        text_list = tokenize(ten_k_body)

        # search for the keywords
        counts_dict = get_count(text_list)
        counts_dict['cik'] = df_mf.at[i, 'CIK']
        counts_dict['date'] = df_mf.at[i, 'Date Filed']
        counts_dict['company_name'] = df_mf.at[i, 'Company Name']
        counts_dict['filename'] = f
        counts_dict['form_type'] = df_mf.at[i, 'Form Type']
        rows.append(counts_dict)

        # Periodic checkpoint; only the most recent one is kept on S3.
        if progress % 100 == 0 and progress >= 100:
            csv_to_s3(_sorted_frame(), client, f'keywords_temp_{progress}.csv')
            if progress != 100:
                client.delete_object(Bucket='10k-output',
                                     Key=f'keywords_temp_{progress-100}.csv')

    csv_to_s3(_sorted_frame(), client, 'keywords_final.csv')
    stop_ec2()
def read_file(fpath: str, **kwargs) -> str:
    """
    Return the full content of a file opened through `smart_open`.

    Parameters
    -----------
    fpath: str
        File path.
    kwargs: optional
        Extra keyword arguments forwarded to `smart_open.open`.

    Returns
    --------
    Whatever ``read()`` returns for the opened file.
    """
    with smart_open.open(fpath, **kwargs) as handle:
        return handle.read()
def __iter__(self):
    """Yield one ``{'words': ..., 'cities': ...}`` dict per usable line.

    Every file in ``self.file_list`` (relative to ``self.root_path``) is read
    line by line as UTF-8. A line is skipped when processing yields fewer
    than two words, or when fewer than two distinct cities are found in it.
    """
    jieba.enable_parallel(8)
    for filename in self.file_list:
        with open(self.root_path + filename, encoding='utf-8') as f:
            for line in f:
                words = self._process(line)
                # at least 2 words are needed to contain 2 cities
                if words and len(words) >= 2:
                    words, cities = self._retrieve_cities(words)
                    # get unique cities
                    unique_cities = list(set(cities))
                    # at least 2 cities are needed to form a link
                    if len(unique_cities) >= 2:
                        yield {'words': words, 'cities': unique_cities}
def restore_spilled_objects(self, object_refs: List[ObjectRef], url_with_offset_list: List[str]):
    """Read spilled objects back from storage and put them into the store.

    object_refs: references of the objects to restore.
    url_with_offset_list: one entry per object reference, in the same order.
        Each entry is decoded with ``.decode()`` before parsing, so the
        entries are expected to be bytes-like despite the annotation.

    For each object the file at the parsed base URL is opened, positioned at
    the parsed offset, and two 8-byte little-endian lengths (metadata, then
    buffer) are read, followed by the metadata itself.
    """
    for position, object_ref in enumerate(object_refs):
        parsed_result = parse_url_with_offset(
            url_with_offset_list[position].decode())

        # Read a part of the file and recover the object.
        with open(parsed_result.base_url, "rb") as f:
            f.seek(parsed_result.offset)
            metadata_len = int.from_bytes(f.read(8), byteorder="little")
            buf_len = int.from_bytes(f.read(8), byteorder="little")
            self._size_check(metadata_len, buf_len, parsed_result.size)
            metadata = f.read(metadata_len)
            # read remaining data to our buffer
            self._put_object_to_store(metadata, buf_len, f, object_ref)
def process(self, something):
    """Yield the products stored in ``self.input_path`` as a list of rows.

    The file content is parsed with ``ast.literal_eval`` and is expected to
    be a sequence of dicts. Each product becomes
    ``[id, name, price, currency, created_at]`` (missing keys give None).
    The complete list is printed, the root logger level is set to INFO, and
    the list is yielded as a single item. ``something`` is not used.
    """
    with open(self.input_path) as fin:
        products_list = ast.literal_eval(fin.read())

    fields = ("id", "name", "price", "currency", "created_at")
    clear_data = [[prod.get(field) for field in fields]
                  for prod in products_list]

    print(clear_data)
    logging.getLogger().setLevel(logging.INFO)
    yield clear_data
def __build_sentences(corpus_path, data_path):
    """Return a ``LineSentence`` over the word-segmented corpus.

    corpus_path: CSV file; the text to segment is taken from the second
        column (``row[1]``) of every row.
    data_path: directory in which ``sentences.txt`` is cached.

    If ``<data_path>/sentences.txt`` does not exist yet, it is created with
    one segmented sentence per line (words separated by single spaces).
    Every row is written with its own trailing newline, so no sentence is
    lost at the end of the file and no two sentences end up on the same line.
    """
    sentences_path = '%s/sentences.txt' % data_path
    if not os.path.exists(sentences_path):
        # Read the corpus, segment it into words, then save the segmented file.
        with open(corpus_path, 'r') as corpus_file, \
                open(sentences_path, 'w') as sentences_file:
            for row in csv.reader(corpus_file):
                sentences_file.write(' '.join(jieba.cut(row[1])) + '\n')
    return word2vec.LineSentence(smart_open.open(sentences_path))
def add_tfidf_records(
        s3_filename='s3://hanks-bda-2020-01/data-output/tfidf/000000_0'):
    """Load tf-idf records from ``s3_filename`` into the ``tfidf`` DynamoDB table.

    Every line holds ``doc_id``, ``term`` and ``value`` separated by the
    ``\\x01`` character. Each record is stored with ``put_item``; the value is
    converted to ``Decimal``. A progress line is printed every 1000 records.
    """
    res = boto3.resource('dynamodb')
    table = res.Table('tfidf')
    create_table()

    with open(s3_filename, 'rb') as fin:
        for i, raw_line in enumerate(fin, start=1):
            fields = raw_line.decode('utf-8').strip().split('\x01')
            doc_id, term, value = fields
            item = {
                'term': term,
                'doc_id': doc_id,
                'value': Decimal(value)
            }
            table.put_item(Item=item)
            if (i % 1000) == 0:
                print(f"I {i}")
def unpickle(fname):
    """Load a pickled object from `fname`.

    The file is opened with `open` in binary mode; per the original
    documentation this is smart_open's `open`, so that `fname` can be on S3,
    HDFS, compressed etc.

    Parameters
    ----------
    fname : str
        Path to pickle file.

    Returns
    -------
    object
        Python object loaded from `fname`.

    """
    with open(fname, 'rb') as stream:
        # NOTE(review): the original comment here read "needed because
        # loading from S3 doesn't support readline()"; confirm whether it
        # still applies to the encoding argument.
        loaded = _pickle.load(stream, encoding='latin1')
    return loaded
def get_paths_from_csv(_fnfn, path_key=PATH_KEY, path_ext=PATH_EXT, path_prefix='', path_suffix='', sep='\t'):
    """Collect the paths listed in one column of a delimited file.

    _fnfn: file to read; parsed with ``csv.DictReader``.
    path_key: column holding the path; a falsy value falls back to
        ``DEFAULT_PATH_KEY``.
    path_ext: accepted for interface compatibility; not used here.
    path_prefix: if given, joined in front of each path with ``os.path.join``.
    path_suffix: if given, appended to each path.
    sep: column delimiter.

    Rows whose path column is missing or empty are skipped.
    Returns the list of resulting paths in file order.
    """
    path_key = path_key or DEFAULT_PATH_KEY

    paths = []
    with open(_fnfn) as pf:
        for row in csv.DictReader(pf, delimiter=sep):
            path = row.get(path_key, '')
            if not path:
                continue
            if path_prefix:
                path = os.path.join(path_prefix, path)
            if path_suffix:
                path = path + path_suffix
            paths.append(path)
    return paths
def parallel_read(old, fname):
    """Load a checkpoint file into the tree structure of ``old``.

    old: pytree whose structure (and leaf shapes) the checkpoint must match.
    fname: file readable by ``np.load``; its arrays are taken in iteration
        order and matched positionally against the leaves of ``old``.

    Returns a pytree with the structure of ``old`` and the loaded arrays as
    leaves. Raises AssertionError ("Incompatible checkpoint") when a loaded
    array's shape differs from the corresponding leaf of ``old``.

    Uses ``jax.tree_util`` rather than the top-level ``jax.tree_flatten`` /
    ``jax.tree_unflatten`` aliases, which newer JAX releases no longer
    provide.
    """
    old_vals, treedef = jax.tree_util.tree_flatten(old)

    # Read the whole file first, then let numpy parse it from memory.
    with open(fname, "rb") as f:
        buf = f.read()
    loaded = np.load(io.BytesIO(buf))

    new_vals = [loaded[name] for name in loaded]

    for new, previous in zip(new_vals, old_vals):
        assert new.shape == previous.shape, "Incompatible checkpoint"

        # Arrays that load back as opaque 2-byte void ('V2') are
        # reinterpreted in place as bfloat16.
        if new.dtype == np.dtype('V2'):
            new.dtype = jnp.bfloat16

    return jax.tree_util.tree_unflatten(treedef, new_vals)
def open_s3(uri: str, *args: Any, **kwargs: Any) -> smart_open.open:
    """Stream an s3 key for read / write operations.

    Thin wrapper around ``smart_open.open`` that always supplies the
    transport parameters below (custom endpoint URL and AES256 server-side
    encryption), which allows us to fine-tune access control for testing.
    Positional and keyword arguments are forwarded unchanged.
    """
    resource_kwargs = {'endpoint_url': _ENDPOINT_URL}
    extra_args = {'ServerSideEncryption': 'AES256'}
    transport_params = {
        'resource_kwargs': resource_kwargs,
        'ExtraArgs': extra_args,
    }
    return smart_open.open(uri, *args, transport_params=transport_params, **kwargs)
def write_file_to_storage(self, records):
    """Write ``records`` as tab-separated rows to ``s3://<bucket>/<key>.tsv.gz``.

    records: iterable of rows accepted by ``csv.writer.writerows``.

    The object is opened through ``smart_open.open`` using this instance's
    session and endpoint URL.
    """
    # Construct the storage URI
    storage_uri = f's3://{self.s3_bucket}/{self.s3_key}.tsv.gz'

    # Parameters to pass to the smart_open open function
    transport_params = dict(
        session=self.session,
        resource_kwargs=dict(endpoint_url=self.endpoint_url),
    )

    # Write records to S3
    with smart_open.open(storage_uri, 'w', transport_params=transport_params) as fout:
        csv.writer(fout, delimiter='\t', lineterminator='\n').writerows(records)
def output_samples(self, filename, n=None):
    """Render the stored categorical / continuous samples to PNG grids.

    filename: reference path; each grid is written next to it as
        ``info_cat_<basename>`` or ``info_cont_<basename>``.
    n: not used by this method.

    Sample sets that are None are skipped. The grid has one row per
    ``samples.shape[1]`` images and is produced without gradient tracking.
    """
    directory = os.path.dirname(filename)
    basename = os.path.basename(filename)
    sample_sets = (('cat', self.categorical_samples),
                   ('cont', self.continuous_samples))

    with torch.no_grad():
        for name, samples in sample_sets:
            if samples is not None:
                grid_imgs = self.trainer.target_g(samples)
                grid_filename = os.path.join(directory, f'info_{name}_{basename}')
                with smart_open.open(grid_filename, 'wb') as output_file:
                    torchvision.utils.save_image(grid_imgs,
                                                 output_file,
                                                 nrow=samples.shape[1],
                                                 normalize=True,
                                                 range=(-1, 1),
                                                 format='png')
def wem_export_years():
    """Export the WEM energy and market-value series for 2020 and 2019.

    For each year the two result lists are concatenated (energy first) and
    written as JSON, encoded with ``NemEncoder``, to
    ``<BASE_EXPORT>/wem/energy/daily/<year>.json``.
    """
    for year in (2020, 2019):
        json_envelope = wem_energy_year(year) + wem_market_value_year(year)

        year_path = BASE_EXPORT + f"/wem/energy/daily/{year}.json"
        upload_params = dict(multipart_upload_kwargs=UPLOAD_ARGS)

        with open(year_path, "w", transport_params=upload_params) as fh:
            json.dump(json_envelope, fh, cls=NemEncoder)
def mergesort(sorted_filenames, columns, nway=2, tmp_dir='', encoding='utf-8'):
    """Merge sorted csv files into a single sorted output file.

    Up to ``nway`` files are merged at a time into ``merge<N>.csv`` inside
    ``tmp_dir``; the merged file is queued again and its inputs are DELETED.
    This repeats until a single file is left.

    sorted_filenames: paths of the already sorted csv files.
    columns: forwarded to ``yield_csv_rows`` for every input file.
    nway: number of files merged per pass; must be at least 2 whenever more
        than one file has to be merged.
    tmp_dir: directory for the intermediate and final merge files.
    encoding: text encoding used for reading and writing.

    Returns the path of the single remaining file (the input itself when
    only one file was given).
    Raises ValueError if ``nway`` < 2 while several files need merging, and
    IndexError if ``sorted_filenames`` is empty.
    """
    # With nway < 2 a pass never reduces the number of files, so the loop
    # below would run forever while deleting and re-creating files.
    if len(sorted_filenames) > 1 and nway < 2:
        raise ValueError('nway must be at least 2, got {}'.format(nway))

    merge_n = 0
    while len(sorted_filenames) > 1:
        merge_filenames, sorted_filenames = sorted_filenames[:nway], sorted_filenames[nway:]

        output_filename = os.path.join(tmp_dir, 'merge{}.csv'.format(merge_n))
        with open(output_filename, 'w', newline='\n', encoding=encoding) as output_fp:
            writer = csv.writer(output_fp)
            merge_n += 1
            rows = (yield_csv_rows(filename, columns, encoding)
                    for filename in merge_filenames)
            # heapq.merge keeps the combined stream sorted without loading
            # the inputs into memory.
            writer.writerows(heapq.merge(*rows))

        sorted_filenames.append(output_filename)

        for filename in merge_filenames:
            os.remove(filename)

    return sorted_filenames[0]
def _collect(self, data_dir: str) -> Iterator[str]:  # type: ignore
    """
    Walk ``data_dir`` recursively and yield the content of the files found.

    If the 'suffix' field in the config is set, only files whose name ends
    with that suffix are read.
    See :func:`~forte.data.readers.RecursiveDirectory
    DeserializeReader.default_configs` for the default configs.

    Args:
        data_dir: The root directory to search for the data packs.

    Returns: Iterator of the data pack string from the directory.

    """
    for root, _, files in os.walk(data_dir):
        for file in files:
            if self.configs.suffix and not file.endswith(self.configs.suffix):
                continue
            with open(os.path.join(root, file)) as f:
                yield f.read()
def download_file(src: Union[str, "Pathy"], dest: Path, *, force: bool = False) -> None:
    """Download a file using smart_open.

    The content is copied in chunks, so the whole file is never held in
    memory at once.

    src (str / Pathy): The source URL or path of the file.
    dest (Path): The destination path.
    force (bool): Whether to force download even if file exists.
        If False, the download will be skipped.
    """
    import shutil

    import smart_open

    if dest.exists() and not force:
        return None
    src = str(src)
    # ignore_ext=True is passed so the bytes are copied as stored.
    with smart_open.open(src, mode="rb", ignore_ext=True) as input_file:
        with dest.open(mode="wb") as output_file:
            shutil.copyfileobj(input_file, output_file)
def test_write_03b(self):
    """Does writing a last chunk size equal to a multiple of the min_part_size work?"""
    min_part_size = 256 * 1024
    expected = b"t" * min_part_size * 2

    writer = smart_open.gcs.Writer(
        BUCKET_NAME, WRITE_BLOB_NAME, min_part_size=min_part_size
    )
    with writer as fout:
        fout.write(expected)
        self.assertEqual(fout._current_part.tell(), 262144)
        self.assertEqual(fout._total_parts, 1)

    # read back the same key and check its content
    blob_url = "gs://{}/{}".format(BUCKET_NAME, WRITE_BLOB_NAME)
    with smart_open.open(blob_url) as fin:
        output = fin.read().encode('utf-8')

    self.assertEqual(output, expected)
def _open(self, path_or_uri, mode):
    """Open a source file given as a path, URI or source-file object.

    path_or_uri: a ``LocalGitFile``, a ``SourceFile`` or anything accepted
        by ``smart_open.open``.
    mode: mode passed to ``smart_open.open``.

    A ``LocalGitFile`` is read with ``git show <ref>:<path>`` and returned as
    an in-memory bytes stream (``mode`` is ignored in that case). A
    ``SourceFile`` is first resolved to its path or URI.

    Raises WorkflowError if opening fails.
    """
    from smart_open import open

    if isinstance(path_or_uri, LocalGitFile):
        import git

        repo = git.Repo(path_or_uri.repo_path)
        blob_spec = "{}:{}".format(path_or_uri.ref, path_or_uri.path)
        return io.BytesIO(repo.git.show(blob_spec).encode())

    if isinstance(path_or_uri, SourceFile):
        path_or_uri = path_or_uri.get_path_or_uri()

    try:
        return open(path_or_uri, mode)
    except Exception as e:
        message = "Failed to open source file {}".format(path_or_uri)
        raise WorkflowError(message, e)
def _train_tokenizer(store: BaseConfig) -> spm.SentencePieceProcessor:
    """
    Trains SentencePiece tokenizer on training data

    The trained model is moved into place with ``_move_tokenizer_model`` and
    loaded into a fresh processor. The first line of the training data is
    then encoded as pieces and as ids and logged as a sample.

    Returns the loaded ``SentencePieceProcessor``.
    """
    logging.info("Training SentencePiece tokenizer")
    train_options = dict(
        input=store.training_data,
        model_prefix=store.tokenizer_prefix,
        user_defined_symbols=["<n>", store.field_delimiter_token],
        vocab_size=store.vocab_size,
        hard_vocab_limit=False,
        max_sentence_length=store.max_line_len,
        character_coverage=store.character_coverage,
    )
    spm.SentencePieceTrainer.Train(**train_options)
    _move_tokenizer_model(store)

    sp = spm.SentencePieceProcessor()
    logging.info(f"Loading tokenizer from: {Path(store.tokenizer_model).name}")
    sp.Load(store.tokenizer_model)

    # print sample output
    with open(store.training_data) as f:
        sample = f.readline().strip()
    logging.info(f"Tokenizer model vocabulary size: {len(sp)} tokens")

    sample_repr = repr(sample)
    sampled_pieces = ", ".join(sp.SampleEncodeAsPieces(sample, -1, 0.1))
    logging.info(
        'Mapping first line of training data\n\n{}\n ---- sample tokens mapped to pieces ---- > \n{}\n'
        .format(sample_repr, sampled_pieces))

    sampled_ids = ", ".join([str(idx) for idx in sp.EncodeAsIds(sample)])
    logging.info(
        'Mapping first line of training data\n\n{}\n ---- sample tokens mapped to int ---- > \n{}\n'
        .format(sample_repr, sampled_ids))

    logging.info(
        f"Saving SentencePiece model to {store.tokenizer_prefix}.model and {store.tokenizer_prefix}.vocab"
    )
    return sp
def load_data(node_id, node_emb, tq_emb, qc, qc_loc, FLAGS, filename):
    """Build the training arrays (X, R, y) from a tab-separated session file.

    Every line is split on tabs; the fields at positions 1, 4, 7, ... are
    taken as the queries of one session. Each query is embedded with
    ``get_tq_emb`` and the session is flattened into one vector containing,
    per query, its embedding followed by its difference to the previous
    query's embedding (zeros of length ``FLAGS.emb_dim`` for the first one).

    For each consecutive pair ``(queries[i-1], queries[i])`` present in
    ``qc_loc`` and each candidate ``c`` in ``qc[queries[i-1]]`` one example is
    emitted:

    * X - the flattened features of up to ``FLAGS.max_len`` preceding
      queries, left-padded with zeros to ``n_fea * FLAGS.max_len`` values;
    * R - the candidate embedding followed by its difference to the
      embedding of ``queries[i-1]``;
    * y - 1.0 if the candidate equals ``queries[i]``, else 0.0.

    Returns ``(X, R, y)`` as numpy arrays; ``y`` has shape ``(n, 1)``.

    NOTE(review): ``n_fea`` is ``2 * FLAGS.emb_dim + 2`` while the first
    query is padded with ``FLAGS.emb_dim`` zeros - presumably ``get_tq_emb``
    returns ``emb_dim + 1`` values; confirm the slicing stays aligned.
    """
    output_X, output_R, output_y = [], [], []
    n_fea = 2 * FLAGS.emb_dim + 2
    window = n_fea * FLAGS.max_len

    with open(filename, 'r') as fp:
        for line in fp:
            fields = line.strip().split('\t')
            queries = fields[1::3]
            embs = [
                get_tq_emb(node_id, node_emb, tq_emb, FLAGS, q)
                for q in queries
            ]

            # Flatten the session: [emb_0, delta_0, emb_1, delta_1, ...]
            data = np.array([])
            for pos, emb in enumerate(embs):
                if pos > 0:
                    delta = emb - embs[pos - 1]
                else:
                    delta = np.zeros(FLAGS.emb_dim)
                data = np.append(np.append(data, emb), delta)

            for pos in range(1, len(queries)):
                prev_query, next_query = queries[pos - 1], queries[pos]
                if (prev_query, next_query) not in qc_loc:
                    continue

                # Features of (at most) the last max_len queries before pos.
                lo = max(0, (pos - FLAGS.max_len)) * n_fea
                hi = pos * n_fea
                X = data[lo:hi]
                if X.size < window:
                    X = np.append(np.zeros(window - X.size), X)
                assert (X.size == window)

                for c in qc[prev_query]:
                    cemb = get_tq_emb(node_id, node_emb, tq_emb, FLAGS, c)
                    # Starting from an empty array keeps the original dtype
                    # promotion of the two appends.
                    R = np.append(np.append(np.array([]), cemb),
                                  cemb - embs[pos - 1])
                    output_X.append(X)
                    output_R.append(R)
                    output_y.append(1.0 if c == next_query else 0.0)

    output_y = np.array(output_y).reshape((len(output_y), 1))
    return np.array(output_X), np.array(output_R), output_y