def glove2word2vec(glove_input_file, word2vec_output_file):
    """Rewrite `glove_input_file` (GloVe format) as `word2vec_output_file` (word2vec format).

    The output is a single header line, "<num_lines> <num_dims>", followed by
    every line of the input copied through unchanged.

    Returns the ``(num_lines, num_dims)`` pair reported by ``get_glove_info``.
    """
    num_lines, num_dims = get_glove_info(glove_input_file)
    logger.info("converting %i vectors from %s to %s", num_lines, glove_input_file, word2vec_output_file)
    header = "{0} {1}\n".format(num_lines, num_dims).encode('utf-8')
    with smart_open(word2vec_output_file, 'wb') as target:
        target.write(header)
        with smart_open(glove_input_file, 'rb') as source:
            for row in source:
                target.write(row)
    return num_lines, num_dims
def testConversion(self):
    """Check that the files written by word2vec2tensor agree with the source word2vec file."""
    word2vec2tensor(word2vec_model_path=self.datapath, tensor_filename=self.output_folder)

    with smart_open(self.metadata_file, 'rb') as handle:
        metadata = handle.readlines()

    with smart_open(self.tensor_file, 'rb') as handle:
        vectors = handle.readlines()

    # the first line of the word2vec file holds "<number of words> <vector size>"
    with smart_open(self.datapath, 'rb') as handle:
        header = handle.readline().strip()

    number_words, vector_size = map(int, header.split(b' '))
    row_mismatch = ('Metadata file %s and tensor file %s imply different number of rows.'
                    % (self.metadata_file, self.tensor_file))
    self.assertTrue(len(metadata) == len(vectors) == number_words, row_mismatch)

    # normalise what was read back: bare words, space-separated vector rows
    words = [raw_word.strip() for raw_word in metadata]
    rows = [raw_row.replace(b'\t', b' ') for raw_row in vectors]

    # reference vectors loaded straight from the original file
    orig_model = KeyedVectors.load_word2vec_format(self.datapath, binary=False)

    # every written row must match the reference vector of the word on the same line
    for word_bytes, row_bytes in zip(words, rows):
        key = word_bytes.decode("utf8")
        row_text = row_bytes.decode("utf8")
        values = np.array([float(field) for field in row_text.split()])
        np.testing.assert_almost_equal(orig_model[key], values, decimal=5)
def test_s3_iter_moto(self):
    """Are S3 files iterated over correctly?"""
    big_line = b"*" * 5 * 1024**2
    small_lines = [b'0123456789'] * 1024
    last_line = b"test"
    expected = [big_line] + small_lines + [last_line]

    # fake bucket to write into
    s3 = boto3.resource('s3')
    s3.create_bucket(Bucket='mybucket')

    with smart_open.smart_open("s3://mybucket/mykey", "wb", s3_min_part_size=5 * 1024**2) as fout:
        # one huge line first (=full multipart upload) ...
        fout.write(big_line + b'\n')
        # ... then many small ones ...
        for small in small_lines:
            fout.write(small + b'\n')
        # ... and a final line without a trailing newline
        fout.write(last_line)

    # read back through a plain object
    reader = smart_open.smart_open("s3://mybucket/mykey")
    self.assertEqual([row.rstrip(b'\n') for row in reader], expected)

    # read back through a context manager
    with smart_open.smart_open("s3://mybucket/mykey") as reader:
        stripped = [row.rstrip(b'\n') for row in reader]
    self.assertEqual(stripped, expected)
def test_s3_boto(self, mock_s3_open_read, mock_boto):
    """Is S3 line iterator called correctly?

    Each scenario opens an s3:// URI, starts iteration, and then inspects the
    most recent calls recorded on the boto mock.
    """
    default_host = 's3.amazonaws.com'
    # make the mocked boto.config.get report the default host
    smart_open.smart_open_lib.boto.config.get.return_value = default_host

    def open_and_iterate(uri, **kwargs):
        smart_open.smart_open(uri, **kwargs).__iter__()

    # no credentials
    open_and_iterate("s3://mybucket/mykey")
    mock_boto.connect_s3.assert_called_with(
        aws_access_key_id=None, aws_secret_access_key=None,
        profile_name=None, host=default_host)

    # credentials embedded in the URI
    open_and_iterate("s3://access_id:access_secret@mybucket/mykey")
    mock_boto.connect_s3.assert_called_with(
        aws_access_key_id="access_id", aws_secret_access_key="access_secret",
        profile_name=None, host=default_host)

    # named credential profile
    open_and_iterate("s3://mybucket/mykey", profile_name="my_credentials")
    mock_boto.connect_s3.assert_called_with(
        aws_access_key_id=None, aws_secret_access_key=None,
        profile_name="my_credentials", host=default_host)

    # bucket and key are looked up on the connection
    open_and_iterate("s3://access_id:access_secret@mybucket/mykey")
    mock_boto.connect_s3().get_bucket.assert_called_with("mybucket")
    mock_boto.connect_s3().get_bucket().get_key.assert_called_with("mykey")

    # user-specified host overrides the default
    open_and_iterate("s3://access_id:access_secret@mybucket/mykey", host='aa.domain.com')
    mock_boto.connect_s3.assert_called_with(
        aws_access_key_id="access_id", aws_secret_access_key="access_secret",
        profile_name=None, host='aa.domain.com')
def test_s3_metadata_write(self):
    """Options passed through ``s3_upload`` should be visible on the stored S3 object."""
    # load the local gzip fixture
    fixture = os.path.join(CURR_DIR, 'test_data/crime-and-punishment.txt.gz')
    with smart_open.smart_open(fixture, 'rb') as fin:
        payload = fin.read()

    # test bucket
    s3 = boto3.resource('s3')
    s3.create_bucket(Bucket='mybucket')

    # upload with explicit content type / encoding
    upload_options = {
        'ContentType': 'text/plain',
        'ContentEncoding': 'gzip',
    }
    with smart_open.smart_open(
            's3://mybucket/crime-and-punishment.txt.gz', 'wb', s3_upload=upload_options) as fout:
        fout.write(payload)

    stored = s3.Object('mybucket', 'crime-and-punishment.txt.gz')
    self.assertIn('text/plain', stored.content_type)
    self.assertEqual(stored.content_encoding, 'gzip')
def test_s3_iter_moto(self):
    """Are S3 files iterated over correctly?"""
    big_line = b"*" * 5 * 1024**2
    small_lines = [b'0123456789'] * 1024
    last_line = b"test"
    expected = [big_line] + small_lines + [last_line]

    # fake bucket to write into
    conn = boto.connect_s3()
    conn.create_bucket("mybucket")

    # a smaller multipart part size keeps this test fast
    smart_open_lib.S3_MIN_PART_SIZE = 5 * 1024**2

    with smart_open.smart_open("s3://mybucket/mykey", "wb") as fout:
        # one huge line first (=full multipart upload) ...
        fout.write(big_line + b'\n')
        # ... then many small ones ...
        for small in small_lines:
            fout.write(small + b'\n')
        # ... and a final line without a trailing newline
        fout.write(last_line)

    # read back through a plain object
    reader = smart_open.smart_open("s3://mybucket/mykey")
    self.assertEqual([row.rstrip(b'\n') for row in reader], expected)

    # read back through a context manager
    with smart_open.smart_open("s3://mybucket/mykey") as reader:
        stripped = [row.rstrip(b'\n') for row in reader]
    self.assertEqual(stripped, expected)
def test_http_bz2(self):
    """Can open bz2 via http?"""
    test_string = b'Hello World Compressed.'

    #
    # TODO: why are these tests writing to temporary files? We can do the
    # bz2 compression in memory.
    #
    # reserve a .bz2 path; the handle is closed again as soon as we know the name
    with tempfile.NamedTemporaryFile('wb', suffix='.bz2', delete=False) as placeholder:
        test_file = placeholder.name

    # write through smart_open to that .bz2 path ...
    with smart_open.smart_open(test_file, 'wb') as outfile:
        outfile.write(test_string)

    # ... and collect the raw bytes that ended up on disk
    with open(test_file, 'rb') as raw:
        compressed_data = raw.read()

    if os.path.isfile(test_file):
        os.unlink(test_file)

    url = "http://127.0.0.1/data.bz2"
    responses.add(responses.GET, url, body=compressed_data, stream=True)
    remote = smart_open.smart_open(url)

    # reading over http must give back the original payload
    self.assertEqual(remote.read(), test_string)
def get_lines(glove_file_name):
    """Return the number of vectors and dimensions in a file in GloVe format.

    The file is read in a single pass (it used to be opened and scanned twice,
    which doubles the transfer for remote sources).

    Parameters
    ----------
    glove_file_name : str
        Path or URI understood by ``smart_open``.

    Returns
    -------
    (int, int)
        ``(num_lines, num_dims)``: the line count, and the number of
        whitespace-separated fields on the first line minus one (the word
        itself). An empty file yields ``(0, -1)``, as before.
    """
    num_lines = 0
    num_dims = -1  # what ``len("".split()) - 1`` evaluates to for an empty file
    with smart_open.smart_open(glove_file_name, 'r') as f:
        for line in f:
            if num_lines == 0:
                # dimensionality is taken from the first line only
                num_dims = len(line.split()) - 1
            num_lines += 1
    return num_lines, num_dims
def test_file(self, mock_smart_open):
    """Is file:// line iterator called correctly?"""
    prefix = "file://"
    read_mode = "rb"

    def assert_opened(uri, expected_path):
        # open, start iterating, then check which path reached the mocked opener
        smart_open.smart_open(uri, read_mode).__iter__()
        mock_smart_open.assert_called_with(expected_path, read_mode, buffering=-1)

    # plain absolute path behind the file:// scheme
    assert_opened(prefix + '/tmp/test.txt', '/tmp/test.txt')

    # '#' characters are part of the path
    assert_opened(prefix + '/tmp/test#hash##more.txt', '/tmp/test#hash##more.txt')

    # no scheme at all
    assert_opened('aa#aa', 'aa#aa')

    # computed but not asserted on in this test
    short_path = "~/tmp/test.txt"
    full_path = os.path.expanduser(short_path)
def word2vec2tensor(word2vec_model_path, tensor_filename, binary=False):
    """Convert a file in Word2Vec format into two TSV files.

    ``<tensor_filename>_tensor.tsv`` receives one tab-separated vector per
    line; ``<tensor_filename>_metadata.tsv`` receives the matching word on the
    same line number.

    Parameters
    ----------
    word2vec_model_path : str
        Path to file in Word2Vec format.
    tensor_filename : str
        Prefix for output files.
    binary : bool, optional
        True if input file in binary format.

    """
    model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_model_path, binary=binary)
    outfiletsv = tensor_filename + '_tensor.tsv'
    outfiletsvmeta = tensor_filename + '_metadata.tsv'
    newline = gensim.utils.to_utf8('\n')

    with smart_open(outfiletsv, 'wb') as file_vector:
        with smart_open(outfiletsvmeta, 'wb') as file_metadata:
            for word in model.index2word:
                file_metadata.write(gensim.utils.to_utf8(word) + newline)
                vector_row = '\t'.join(map(str, model[word]))
                file_vector.write(gensim.utils.to_utf8(vector_row) + newline)

    logger.info("2D tensor file saved to %s", outfiletsv)
    logger.info("Tensor metadata file saved to %s", outfiletsvmeta)
def test_s3_mode_mock(self, mock_session):
    """Are s3:// open modes passed correctly?"""
    # write mode, valid s3 URI, explicit host
    smart_open.smart_open("s3://mybucket/mykey", "w", host='s3.amazonaws.com')

    # the host must surface as the endpoint URL of the requested s3 resource
    resource_factory = mock_session.return_value.resource
    resource_factory.assert_called_with('s3', endpoint_url='http://s3.amazonaws.com')
def test_s3_mode_mock(self, mock_write, mock_boto):
    """Are s3:// open modes passed correctly?"""
    # write mode, valid s3 URI
    smart_open.smart_open("s3://mybucket/mykey", "w")

    mock_boto.connect_s3.assert_called_with(aws_access_key_id=None, aws_secret_access_key=None)

    connection = mock_boto.connect_s3()
    connection.lookup.return_value = True
    connection.get_bucket.assert_called_with("mybucket")

    self.assertTrue(mock_write.called)
def test_session_write_mode(self):
    """Write stream should use a custom boto3.Session."""
    fake_resource = mock.MagicMock()
    session = boto3.Session()
    session.resource = fake_resource

    smart_open.smart_open('s3://bucket/key', 'wb', s3_session=session)

    # the s3 resource must have been requested from the session we supplied
    fake_resource.assert_called_with('s3')
def write_read_assertion(self, test_file):
    """Write ``self.TEXT`` to `test_file`, read it back and compare, then delete the file."""
    # 'b' for binary, needed on Windows
    with smart_open.smart_open(test_file, 'wb') as fout:
        encoded = self.TEXT.encode('utf8')
        fout.write(encoded)

    with smart_open.smart_open(test_file, 'rb') as fin:
        round_tripped = fin.read().decode('utf8')
        self.assertEqual(round_tripped, self.TEXT)

    if os.path.isfile(test_file):
        os.unlink(test_file)
def test_gzip_write_mode(self):
    """Should always open in binary mode when writing through a codec."""
    boto3.resource('s3').create_bucket(Bucket='bucket')
    target = "s3://bucket/key.gz"

    # parsed but otherwise unused here
    uri = smart_open_lib._parse_uri(target)

    with mock.patch('smart_open.s3.open') as patched_open:
        smart_open.smart_open(target, "wb")
        patched_open.assert_called_with('bucket', 'key.gz', 'wb')
def prepend_line(infile, outfile, line):
    """Copy `infile` to `outfile` with `line` inserted as the first line.

    Parameters
    ----------
    infile : str
        Path or URI of the existing file, opened through ``smart_open``.
    outfile : str
        Path or URI of the file to create.
    line : object
        Content of the new first line. Bytes are written as they are; anything
        else is converted with ``str()`` and encoded as UTF-8. A trailing line
        break is normalised to exactly one ``\\n``.

    Returns
    -------
    str
        `outfile`, unchanged.
    """
    # Both streams are binary, so the header has to be bytes as well.
    if isinstance(line, bytes):
        header = line
    else:
        header = str(line).encode('utf-8')
    header = header.rstrip(b'\r\n') + b'\n'

    # Modes must be exactly 'rb' / 'wb'; padded mode strings are rejected.
    with smart_open.smart_open(infile, 'rb') as old:
        with smart_open.smart_open(outfile, 'wb') as new:
            new.write(header)
            # distinct loop variable so the `line` argument is not shadowed
            for row in old:
                new.write(row)
    return outfile
def test_readline(self):
    """Does readline() return the correct file content?"""
    boto3.resource('s3').create_bucket(Bucket='mybucket')

    payload = u"hello žluťoučký world!\nhow are you?".encode('utf8')
    with smart_open.smart_open("s3://mybucket/mykey", "wb") as fout:
        fout.write(payload)

    reader = smart_open.smart_open("s3://mybucket/mykey", "rb")
    first_line = u"hello žluťoučký world!\n".encode("utf-8")
    self.assertEqual(reader.readline(), first_line)
def test_s3_mode_mock(self, mock_write, mock_boto):
    """Are s3:// open modes passed correctly?"""
    # make the mocked boto.config.get report the default host
    smart_open.smart_open_lib.boto.config.get.return_value = 's3.amazonaws.com'

    # write mode, valid s3 URI
    smart_open.smart_open("s3://mybucket/mykey", "w")

    expected_connect_kwargs = dict(
        aws_access_key_id=None,
        aws_secret_access_key=None,
        profile_name=None,
        host='s3.amazonaws.com',
    )
    mock_boto.connect_s3.assert_called_with(**expected_connect_kwargs)

    connection = mock_boto.connect_s3()
    connection.lookup.return_value = True
    connection.get_bucket.assert_called_with("mybucket")

    self.assertTrue(mock_write.called)
def test_read_encoding_implicit_text(self):
    """Should open the file with the correct encoding, implicit text read."""
    boto3.resource('s3').create_bucket(Bucket='bucket')
    key = "s3://bucket/key.txt"
    text = u'это знала ева, это знал адам, колеса любви едут прямо по нам'

    # store the text as raw koi8-r bytes
    encoded = text.encode('koi8-r')
    with smart_open.smart_open(key, 'wb') as fout:
        fout.write(encoded)

    # no mode given: only the encoding is passed when reading back
    with smart_open.smart_open(key, encoding='koi8-r') as fin:
        actual = fin.read()

    self.assertEqual(text, actual)
def test_write_encoding(self):
    """Should open the file for writing with the correct encoding."""
    boto3.resource('s3').create_bucket(Bucket='bucket')
    key = "s3://bucket/key.txt"
    text = u'какая боль, какая боль, аргентина - ямайка, 5-0'

    # text-mode write with an explicit encoding
    with smart_open.smart_open(key, 'w', encoding='koi8-r') as fout:
        fout.write(text)

    # reading with the same encoding must give the text back
    with smart_open.smart_open(key, encoding='koi8-r') as fin:
        actual = fin.read()

    self.assertEqual(text, actual)
def test_hdfs(self, mock_subprocess):
    """Is HDFS line iterator called correctly?"""
    mock_subprocess.PIPE.return_value = "test"
    expected_command = ["hdfs", "dfs", "-cat", "/tmp/test.txt"]

    # both spellings of the scheme must produce the same ``hdfs dfs -cat`` call
    for uri in ("hdfs:///tmp/test.txt", "hdfs://tmp/test.txt"):
        smart_open.smart_open(uri).__iter__()
        mock_subprocess.Popen.assert_called_with(expected_command, stdout=mock_subprocess.PIPE)
def test_readline_eof(self):
    """Does readline() return empty string on EOF?"""
    boto3.resource('s3').create_bucket(Bucket='mybucket')

    # open for writing and close again without writing anything
    with smart_open.smart_open("s3://mybucket/mykey", "wb"):
        pass

    reader = smart_open.smart_open("s3://mybucket/mykey", "rb")

    # repeated reads must keep returning empty bytes
    for _ in range(3):
        self.assertEqual(reader.readline(), b"")
def test_s3_iter_lines(self):
    """Does s3_iter_lines give correct content?"""
    # fake bucket and key
    boto3.resource('s3').create_bucket(Bucket='mybucket')
    payload = u"hello žluťoučký world!\nhow are you?".encode('utf8')

    with smart_open.smart_open("s3://mybucket/mykey", "wb") as fout:
        fout.write(payload)

    # the lines yielded by iteration must concatenate back to the payload
    reader = smart_open.smart_open("s3://mybucket/mykey", "rb")
    pieces = list(reader)
    self.assertEqual(b''.join(pieces), payload)
def test_write_bad_encoding_replace(self):
    """With errors='replace', characters koi8-r cannot encode are stored as '?'."""
    boto3.resource('s3').create_bucket(Bucket='bucket')
    key = "s3://bucket/key.txt"

    text = u'欲しい気持ちが成長しすぎて'
    # one replacement character per input character
    expected = u'?' * len(text)

    with smart_open.smart_open(key, 'w', encoding='koi8-r', errors='replace') as fout:
        fout.write(text)

    with smart_open.smart_open(key, encoding='koi8-r') as fin:
        actual = fin.read()

    self.assertEqual(expected, actual)
def test_read_never_returns_none(self):
    """read should never return None."""
    boto3.resource('s3').create_bucket(Bucket='mybucket')

    test_string = u"ветер по морю гуляет..."
    with smart_open.smart_open("s3://mybucket/mykey", "wb") as fout:
        fout.write(test_string.encode('utf8'))

    reader = smart_open.smart_open("s3://mybucket/mykey", "rb")

    # first read returns everything ...
    self.assertEqual(reader.read(), test_string.encode("utf-8"))
    # ... later reads return empty bytes
    for _ in range(2):
        self.assertEqual(reader.read(), b"")
def test_gzip_read_mode(self):
    """Should always open in binary mode when reading through a codec."""
    boto3.resource('s3').create_bucket(Bucket='bucket')
    key = "s3://bucket/key.gz"

    text = u"если-б я был султан и имел трёх жён, то тройной красотой был бы окружён"
    with smart_open.smart_open(key, "wb") as fout:
        fout.write(text.encode("utf-8"))

    # text mode is requested, yet the S3 layer must be opened with 'rb'
    with mock.patch('smart_open.s3.open') as patched_open:
        smart_open.smart_open(key, "r")
        patched_open.assert_called_with('bucket', 'key.gz', 'rb')
def test_write_bad_encoding_replace(self):
    """Should replace characters that failed to encode."""
    text = u'欲しい気持ちが成長しすぎて'
    # one replacement character per input character
    expected = u'?' * len(text)

    with tempfile.NamedTemporaryFile('wb', delete=True) as scratch:
        path = scratch.name

        with smart_open.smart_open(path, 'w', encoding='koi8-r', errors='replace') as fout:
            fout.write(text)

        with smart_open.smart_open(path, 'r', encoding='koi8-r') as fin:
            actual = fin.read()

    self.assertEqual(expected, actual)
def test_r(self):
    """Reading a UTF string should work."""
    text = u"физкульт-привет!"
    encoded = text.encode('utf-8')

    # put the object straight into the bucket, bypassing smart_open
    s3 = boto3.resource('s3')
    s3.create_bucket(Bucket='bucket')
    s3.Object('bucket', 'key').put(Body=encoded)

    # binary read gives the raw bytes
    with smart_open.smart_open('s3://bucket/key', "rb") as fin:
        self.assertEqual(fin.read(), text.encode('utf-8'))

    # text read with an encoding gives the decoded string
    with smart_open.smart_open('s3://bucket/key', "r", encoding='utf-8') as fin:
        self.assertEqual(fin.read(), text)
def test_readline_iter(self):
    """Does __iter__ return the correct file content?"""
    boto3.resource('s3').create_bucket(Bucket='mybucket')
    lines = [u"всем привет!\n", u"что нового?"]

    with smart_open.smart_open("s3://mybucket/mykey", "wb") as fout:
        fout.write("".join(lines).encode("utf-8"))

    reader = smart_open.smart_open("s3://mybucket/mykey", "rb")
    actual_lines = [raw.decode("utf-8") for raw in reader]

    self.assertEqual(2, len(actual_lines))
    for expected_line, actual_line in zip(lines, actual_lines):
        self.assertEqual(expected_line, actual_line)
def test_s3_modes_moto(self):
    """Do s3:// open modes work correctly?"""
    # fake bucket and key
    boto.connect_s3().create_bucket("mybucket")
    payload = b"second test"

    # write mode, valid s3 URI
    with smart_open.smart_open("s3://mybucket/newkey", "wb") as fout:
        fout.write(payload)

    # a payload without newline comes back as a single line
    reader = smart_open.smart_open("s3://mybucket/newkey", "rb")
    self.assertEqual(list(reader), [payload])
def copy_from_s3_file_to_db(key):
    """Stream the comma-separated file at `key` into ``db_table``.

    The file is opened in text mode through ``smart_open`` and handed to
    ``cur.copy_from``. ``cur`` and ``db_table`` are taken from the enclosing
    module scope.

    Parameters
    ----------
    key : str
        Path or URI of the file to load.
    """
    # ``with`` closes the stream even when ``copy_from`` raises; the explicit
    # ``f.close()`` that used to follow was skipped on errors.
    with smart_open(key, 'r') as f:
        cur.copy_from(f, db_table, sep=',')
def unpickle(fname):
    """Load pickled object from `fname`.

    NOTE(review): unpickling can execute arbitrary code, so `fname` should
    only ever point at trusted data.
    """
    with smart_open(fname) as fin:
        # Read everything up front and use ``loads``: ``load`` can't be used
        # because of loading from S3 (missing readline in smart_open).
        payload = fin.read()
    return _pickle.loads(payload)
def __iter__(self):
    """Yield one list of tokens per line of 'ruwiki_text_lem_comp.txt'."""
    with smart_open.smart_open('ruwiki_text_lem_comp.txt', 'r', encoding='utf-8') as corpus:
        for raw_line in corpus:
            tokens = list(tokenize(raw_line))
            yield tokens
def train(x_train, y_train, word_id_dict, x_dev, y_dev):
    """Build a TextCNN graph, train it on the training set, and checkpoint on dev improvement.

    Everything is written below ``runs/<unix timestamp>`` under the current
    directory: train/dev summaries, the pickled ``word_id_dict`` ("vocab"),
    the pickled flag values ("config"), and model checkpoints.

    Parameters
    ----------
    x_train, y_train
        Training inputs and labels; zipped together and batched by ``dh.batch_iter``.
    word_id_dict
        Mapping whose keys are words and whose values index rows of the
        embedding matrix (used as ``initW[idx]``).
    x_dev, y_dev
        Dev inputs and labels, fed to the model in one piece at each evaluation.

    All hyper-parameters are read from the module-level ``FLAGS``.
    """
    # Training
    # ==================================================
    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn = TextCNN(FLAGS.flag_values_dict())

            # Define Training procedure: Adam on a learning rate that is
            # multiplied by FLAGS.lr_decay every 1000 steps (staircase)
            global_step = tf.Variable(0, name="global_step", trainable=False)
            decayed_lr = tf.train.exponential_decay(FLAGS.lr, global_step, 1000, FLAGS.lr_decay, staircase=True)
            optimizer = tf.train.AdamOptimizer(decayed_lr)
            grads_and_vars = optimizer.compute_gradients(cnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

            # Keep track of gradient values and sparsity (optional)
            grad_summaries = []
            for g, v in grads_and_vars:
                # variables without a gradient are skipped
                if g is not None:
                    grad_hist_summary = tf.summary.histogram("{}/grad/hist".format(v.name), g)
                    sparsity_summary = tf.summary.scalar("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
                    grad_summaries.append(grad_hist_summary)
                    grad_summaries.append(sparsity_summary)
            grad_summaries_merged = tf.summary.merge(grad_summaries)

            # Output directory for models and summaries
            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
            print("Writing to {}\n".format(out_dir))

            # Summaries for loss and accuracy
            loss_summary = tf.summary.scalar("loss", cnn.loss)
            acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)

            # Train Summaries
            train_summary_op = tf.summary.merge([loss_summary, acc_summary, grad_summaries_merged])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)

            # Dev summaries
            dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
            dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
            dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph)

            # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
            checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.num_checkpoints)

            # Write vocabulary and the flag values next to the checkpoints
            with smart_open.smart_open(os.path.join(out_dir, "vocab"), 'wb') as f:
                pickle.dump(word_id_dict, f)
            with smart_open.smart_open(os.path.join(out_dir, "config"), 'wb') as f:
                pickle.dump(FLAGS.flag_values_dict(), f)

            # Initialize all variables
            sess.run(tf.global_variables_initializer())

            # Optionally overwrite the embedding matrix with pre-trained vectors
            if FLAGS.word2vec:
                print("Loading W2V data...")
                pre_emb = KeyedVectors.load_word2vec_format(FLAGS.word2vec, binary=True)
                pre_emb.init_sims(replace=True)
                num_keys = len(pre_emb.vocab)
                print("loaded word2vec len ", num_keys)

                # initial matrix with random uniform
                initW = np.random.uniform(-0.25, 0.25, (FLAGS.vocab_size, FLAGS.embedding_dim))
                # load any vectors from the word2vec
                print("init initW cnn.W in FLAG")
                for w in word_id_dict.keys():
                    arr = []
                    # fallback key: the word with every non-alphanumeric character removed
                    s = re.sub('[^0-9a-zA-Z]+', '', w)
                    # lookup order: exact word, lower-cased word, stripped word;
                    # purely numeric words share the vector stored for '1'
                    if w in pre_emb:
                        arr = pre_emb[w]
                    elif w.lower() in pre_emb:
                        arr = pre_emb[w.lower()]
                    elif s in pre_emb:
                        arr = pre_emb[s]
                    elif s.isdigit():
                        arr = pre_emb['1']
                    # words with no match keep their random initial row
                    if len(arr) > 0:
                        idx = word_id_dict[w]
                        initW[idx] = np.asarray(arr).astype(np.float32)
                print("assigning initW to cnn. len=" + str(len(initW)))
                sess.run(cnn.W.assign(initW))

            def train_step(x_batch, y_batch):
                """Run one optimisation step on a batch, print the metrics, and record the train summary."""
                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch,
                    cnn.dropout_keep_prob: FLAGS.dropout_keep_prob
                }
                _, step, lr, summaries, loss, accuracy = sess.run(
                    [train_op, global_step, decayed_lr, train_summary_op, cnn.loss, cnn.accuracy],
                    feed_dict)
                time_str = datetime.datetime.now().isoformat()
                print("{}: step {}, loss {:g}, lr{:g}, acc {:g}".format(time_str, step, loss, lr, accuracy))
                train_summary_writer.add_summary(summaries, step)

            def dev_step(x_batch, y_batch, writer=None):
                """Evaluate on a dev batch with dropout disabled and return the accuracy.

                The summary is recorded only when `writer` is given.
                """
                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch,
                    cnn.dropout_keep_prob: 1.0
                }
                step, summaries, loss, accuracy = sess.run(
                    [global_step, dev_summary_op, cnn.loss, cnn.accuracy],
                    feed_dict)
                time_str = datetime.datetime.now().isoformat()
                print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
                if writer:
                    writer.add_summary(summaries, step)
                return accuracy

            # Generate batches
            batches = dh.batch_iter(
                list(zip(x_train, y_train)), FLAGS.batch_size, FLAGS.num_epochs)
            # Training loop. For each batch...
            # NOTE(review): `max` shadows the builtin; it holds the best dev accuracy seen so far
            max = 0
            for batch in batches:
                x_batch, y_batch = zip(*batch)
                train_step(x_batch, y_batch)
                current_step = tf.train.global_step(sess, global_step)
                if current_step % FLAGS.evaluate_every == 0:
                    print("\nEvaluation:")
                    accuracy = dev_step(x_dev, y_dev, writer=dev_summary_writer)
                    print("")
                    # checkpoint only when dev accuracy beats the best so far
                    if accuracy > max:
                        max = accuracy
                        path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                        print("Saved model checkpoint to {}\n".format(path))
def read_from_s3_bucket_by_url(self, bucket, file, aws_access_key_id, aws_secret_access_key):
    """Print every line of an S3 object, authenticating through the URL.

    Parameters
    ----------
    bucket : str
        Bucket name.
    file : str
        Key of the object inside the bucket.
    aws_access_key_id, aws_secret_access_key : str
        Credentials, embedded as ``s3://<id>:<secret>@<bucket>/<file>``.

    NOTE(review): the URL contains the secret key, so it must not be logged.
    """
    s3url = 's3://' + aws_access_key_id + ':' + aws_secret_access_key + '@' + bucket + '/' + file
    # The stream used to be iterated without ever being closed; ``with``
    # releases it even if printing is interrupted.
    with smart_open.smart_open(s3url) as fin:
        for line in fin:
            print(line)
def doc2vec(dataset_path, dataset_name, write_path, txt_path_list):
    """Build (or reuse) a one-document-per-line corpus file and train a Doc2Vec model on it.

    If ``<write_path>/alldata-id_<dataset_name>.txt`` does not exist, every file
    in `txt_path_list` is read, normalised and written to it as one line
    prefixed with ``_*<index>``. A Doc2Vec model is then trained on the lines
    of that file, saved, and loaded back. Nothing is returned.

    `dataset_path` is not used in the body.

    NOTE(review): `all_lines` and `control_chars` are not defined in this
    function; they are presumably module-level names — confirm, since
    `all_lines` would otherwise raise NameError (or accumulate across calls).
    """
    # Lower-case the text and isolate punctuation as separate tokens
    def normalize_text(text):
        norm_text = text.lower()
        # Replace breaks with spaces
        norm_text = norm_text.replace('<br />', ' ')
        # Pad punctuation with spaces on both sides
        norm_text = re.sub(r"([\.\",\(\)!\?;:])", " \\1 ", norm_text)
        return norm_text

    alldata_path = os.path.join(write_path, 'alldata-id_' + dataset_name + '.txt')

    if not os.path.isfile(alldata_path):
        # Collect & normalize test/train data
        print("Cleaning up dataset...")
        # txt_path_list holds the path of every text file
        print(" %i files" % (len(txt_path_list)))
        # for each file "txt" (the index `i` is not used)
        for i, txt in tqdm(enumerate(txt_path_list)):
            with smart_open(txt, "rb") as t:
                try:
                    # "one_text" is the whole document
                    one_text = t.read().decode("utf-8")
                    for c in control_chars:
                        one_text = one_text.replace(c, ' ')
                    one_text = normalize_text(one_text)
                    all_lines.append(one_text)
                except UnicodeDecodeError:
                    # we skip this file, but we need to preserve index pos
                    all_lines.append(" ")
                    continue

        # Save to disk for instant re-use on any future runs
        with smart_open(alldata_path, 'wb') as f:
            for idx, line in enumerate(all_lines):
                num_line = u"_*{0} {1}\n".format(idx, line)
                f.write(num_line.encode("utf-8"))

    assert os.path.isfile(alldata_path), "alldata unavailable"
    print("Success, alldata is available for next steps.")

    # ===================================================================
    # Read in alldata
    # ===================================================================
    # First pass: materialise the file only to report how many lines it has
    with smart_open(alldata_path, 'rb', encoding='utf-8') as alldata:
        alldata_list = list(alldata)
    print("Iterating up to: ", len(alldata_list))

    # Second pass: one TaggedDocument per line, tagged with its line index.
    # NOTE(review): `doc` is the raw line, not a list of tokens — confirm this
    # is intended, since TaggedDocument normally takes a list of words.
    with smart_open(alldata_path, 'rb', encoding='utf-8') as alldata:
        documents = [
            TaggedDocument(doc, [i]) for i, doc in tqdm(enumerate(alldata))
        ]
    model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4)

    fname = get_tmpfile(
        os.path.join(write_path, "doc2vec_model_" + dataset_name))
    model.save(fname)
    model = Doc2Vec.load(
        fname)  # you can continue training with the loaded model!
    return
def test_shortcut(self):
    """A plain local path should be handed straight to the patched ``open``."""
    fpath = os.path.join(CURR_DIR, 'test_data/crime-and-punishment.txt')
    with mock.patch('smart_open.smart_open_lib.open') as patched_open:
        handle = smart_open.smart_open(fpath, 'r')
        handle.read()
    patched_open.assert_called_with(fpath, 'r', buffering=-1)
def test_s3_upload_is_none(self, mock_session):
    """Passing ``s3_upload=None`` must still start a multipart upload."""
    smart_open.smart_open("s3://bucket/key", 'wb', s3_upload=None)

    # walk the mock chain: session -> resource -> Object
    session = mock_session.return_value
    s3_object = session.resource.return_value.Object.return_value
    s3_object.initiate_multipart_upload.assert_called()
def test_file(self, mock_smart_open):
    """Is file:// line iterator called correctly?"""
    reader = smart_open.smart_open("file:///tmp/test.txt", "rb")
    reader.__iter__()

    # the scheme must be stripped before the path reaches the mocked opener
    expected_path, expected_mode = "/tmp/test.txt", "rb"
    mock_smart_open.assert_called_with(expected_path, expected_mode)
def nlp_preprocess(filepath_dict: dict,
                   col: str,
                   df=None,
                   verbose: bool = True,
                   overwrite_interim: bool = True) -> pd.DataFrame:
    """Clean, lemmatize and phrase-merge the text in column `col` of `df`.

    Steps, each writing an intermediate file named in `filepath_dict`:

    1. clean every document and join named entities with underscores
       (``doc_txt_filepath``, one document per line, newlines escaped);
    2. lemmatized sentences (``unigram_sentences_filepath``);
    3. bigram model and sentences (``bigram_model_filepath``,
       ``bigram_sentences_filepath``);
    4. trigram model and sentences (``trigram_model_filepath``,
       ``trigram_sentences_filepath``);
    5. whole documents transformed with both phrase models and stripped of
       stop words (``trigram_docs_filepath``);
    6. the processed text is put back into the dataframe, which is saved to
       ``filepath_out``.

    Parameters
    ----------
    filepath_dict : dict
        Paths for the files listed above.
    col : str
        Name of the text column to process.
    df : pandas.DataFrame
        Source data; rows where `col` is null are dropped. Although the
        default is ``None``, a dataframe is required. Must also contain the
        columns ``tfa_master_uid`` and ``app_year``.
    verbose : bool
        Log progress messages.
    overwrite_interim : bool
        When true, regenerate the unigram sentence file.

    Returns
    -------
    pandas.DataFrame
        Copy of the selected rows/columns with `col` replaced by the
        processed text.

    Raises
    ------
    ValueError
        If the number of processed documents differs from the number of rows.

    Cleaning and phrase settings (``to_replace_dict``, ``to_remove_list``,
    ``whole_words_only``, ``ignore_case``, ``nlp_batch_size``,
    ``nlp_n_threads`` and the ``phrase_*`` values) are read from the enclosing
    module scope.
    """

    def clean_doc(corpus):
        """Yield each doc after token substitution and substring removal."""
        for doc in corpus:
            substituted = au_tu.clean_tokens(
                doc,
                tokens=to_replace_dict,
                whole_words_only=whole_words_only,
                ignore_case=ignore_case,
            )
            yield au_tu.remove_substrings(substituted,
                                          to_remove_list=to_remove_list,
                                          whole_words_only=whole_words_only,
                                          ignore_case=ignore_case)

    def tokenize_entities(parsed_doc):
        """Return the doc text with spaces inside each entity replaced by underscores."""
        txt = parsed_doc.text
        for ent in parsed_doc.ents:
            # a one-for-one character swap keeps the later entity offsets valid
            txt = txt[:ent.start_char] + ent.text.replace(
                ' ', '_') + txt[ent.end_char:]
        return txt

    def cleaned_doc_corpus(corpus):
        """Parse the cleaned docs with spaCy and yield them with entities tokenized."""
        for parsed_doc in nlp.pipe(clean_doc(corpus),
                                   batch_size=nlp_batch_size,
                                   n_threads=nlp_n_threads):
            yield tokenize_entities(parsed_doc)

    def punct_space_more(token):
        """Return True for tokens that should be dropped.

        Dropped: punctuation, whitespace, digits, "'s", pronoun lemmas and
        anything of one character or less.
        """
        return (token.is_punct or token.is_space or token.is_digit
                or token.text == "'s" or token.lemma_ == '-PRON-'
                or len(token.text) <= 1)

    def line_doc(filename):
        """Yield docs from `filename`, un-escaping line breaks and stripping punctuation."""

        def hyp_to_us(doc):
            # intra-word hyphen -> underscore
            return re.sub(r'\b-\b', '_', doc)

        def remove_punct(doc):
            # keep: alphanumeric (w), spaces (s), single quote, underscore
            return re.sub(r'[^\w\s\'_]+', '', doc)

        with smart_open(filename) as f:
            for doc in f:
                yield remove_punct(hyp_to_us(doc.decode())).replace(
                    '\\n', '\n')

    def lemmatized_sentence_corpus(filename):
        """Parse docs with spaCy and yield each sentence as space-joined lemmas."""
        for parsed_doc in nlp.pipe(line_doc(filename),
                                   batch_size=nlp_batch_size,
                                   n_threads=nlp_n_threads):
            for sent in parsed_doc.sents:
                yield ' '.join([
                    token.lemma_ for token in sent
                    if not punct_space_more(token)
                ])

    # Extra stop words removed from the final documents. The list used to be
    # bound to `stop_words_extend` but read back as `stop_words_extended`
    # (NameError), and was rebuilt for every document; it is now built once.
    stop_words_extended = frozenset([
        'from', 'subject', 're', 'edu', 'use', 'not', 'would', 'say',
        'could', '_', 'be', 'know', 'good', 'go', 'get', 'do', 'done',
        'try', 'many', 'some', 'nice', 'thank', 'think', 'see', 'rather',
        'easy', 'easily', 'lot', 'lack', 'make', 'want', 'seem', 'run',
        'need', 'even', 'right', 'line', 'even', 'also', 'may', 'take',
        'come'
    ])

    if verbose:
        logger.info(f'Working on text from: {col}')

    df_phrased = df.loc[df[col].notnull(),
                        ['tfa_master_uid', 'app_year', col]].copy()

    # full pipeline (NER enabled) for the entity-tokenizing pass
    nlp = spacy.load('en', disable=[])

    # clean text and tokenize entities
    if verbose:
        logger.info('Cleaning docs...')
    df_phrased[col] = list(cleaned_doc_corpus(df_phrased[col].values))

    # remove 'the_' from NER tokens
    df_phrased[col] = df_phrased[col].apply(
        lambda x: ' '.join([re.sub('^the_', 'the ', y) for y in x.split()]))
    if verbose:
        logger.info('\tDone.')

    # create & open a new file in write mode
    if verbose:
        logger.info('Saving documents, one per line...')
    doc_count = 0
    with codecs.open(filepath_dict['doc_txt_filepath'], 'w',
                     encoding='utf_8') as doc_txt_file:
        for doc in df_phrased[[col]].apply(lambda x: ' '.join(x),
                                           axis=1).tolist():
            # write the doc as a line in the new file
            # escape newline characters in the original doc text
            doc_txt_file.write(doc.replace('\n', '\\n') + '\n')
            doc_count += 1
    if verbose:
        logger.info(
            f"Text from {doc_count:,} docs written to: {filepath_dict['doc_txt_filepath']}"
        )

    # NER is not needed for the remaining passes
    nlp = spacy.load('en', disable=['ner'])

    # lemmatize and save sentences
    if overwrite_interim:
        if verbose:
            logger.info(
                f"Processing documents into unigram sentences: {filepath_dict['unigram_sentences_filepath']}"
            )
        with smart_open(filepath_dict['unigram_sentences_filepath'],
                        'w') as f:
            for sentence in lemmatized_sentence_corpus(
                    filepath_dict['doc_txt_filepath']):
                f.write(sentence + '\n')
        if verbose:
            logger.info('Done.')
    unigram_sentences = LineSentence(
        filepath_dict['unigram_sentences_filepath'])

    if verbose:
        logger.info('Unigram examples:')
        for unigram_sentence in it.islice(unigram_sentences, 10, 20):
            logger.info(u' '.join(unigram_sentence))
        logger.info('=' * 30)

    if verbose:
        logger.info('Finding bigram phrases')
    # create the bigram model
    bigram = Phrases(unigram_sentences,
                     min_count=phrase_min_count,
                     threshold=phrase_threshold,
                     max_vocab_size=phrase_max_vocab_size,
                     progress_per=phrase_progress_per,
                     scoring=phrase_scoring,
                     common_terms=phrase_common_terms)
    bigram_model = Phraser(bigram)
    bigram_model.save(filepath_dict['bigram_model_filepath'])

    if verbose:
        logger.info(
            f"Saving bigram phrased sentences: {filepath_dict['bigram_sentences_filepath']}"
        )
    # save bigram sentences
    with codecs.open(filepath_dict['bigram_sentences_filepath'],
                     'w',
                     encoding='utf_8') as f:
        for unigram_sentence in unigram_sentences:
            bigram_sentence = u' '.join(bigram_model[unigram_sentence])
            f.write(bigram_sentence + '\n')

    bigram_sentences = LineSentence(
        filepath_dict['bigram_sentences_filepath'])
    if verbose:
        logger.info('Bigram examples:')
        for bigram_sentence in it.islice(bigram_sentences, 10, 20):
            logger.info(u' '.join(bigram_sentence))
        logger.info('=' * 30)

    if verbose:
        logger.info('Finding trigram phrases')
    # create the trigram model
    trigram = Phrases(bigram_sentences,
                      min_count=phrase_min_count,
                      threshold=phrase_threshold,
                      max_vocab_size=phrase_max_vocab_size,
                      progress_per=phrase_progress_per,
                      scoring=phrase_scoring,
                      common_terms=phrase_common_terms)
    trigram_model = Phraser(trigram)
    trigram_model.save(filepath_dict['trigram_model_filepath'])

    if verbose:
        logger.info(
            f"Saving trigram phrased sentences: {filepath_dict['trigram_sentences_filepath']}"
        )
    # create trigram sentences
    with codecs.open(filepath_dict['trigram_sentences_filepath'],
                     'w',
                     encoding='utf_8') as f:
        for bigram_sentence in bigram_sentences:
            trigram_sentence = u' '.join(trigram_model[bigram_sentence])
            f.write(trigram_sentence + '\n')

    trigram_sentences = LineSentence(
        filepath_dict['trigram_sentences_filepath'])
    if verbose:
        logger.info('Trigram examples:')
        for trigram_sentence in it.islice(trigram_sentences, 10, 20):
            logger.info(u' '.join(trigram_sentence))
        logger.info('=' * 30)

    if verbose:
        logger.info(
            f"Saving phrased docs using saved models: {filepath_dict['trigram_docs_filepath']}"
        )
    # using saved models, write transformed text out to a new file, one doc per line
    with codecs.open(filepath_dict['trigram_docs_filepath'],
                     'w',
                     encoding='utf_8') as f:
        for parsed_doc in nlp.pipe(line_doc(filepath_dict['doc_txt_filepath']),
                                   batch_size=nlp_batch_size,
                                   n_threads=nlp_n_threads):
            # removing punctuation and whitespace
            unigram_doc = [
                token.lemma_ for token in parsed_doc
                if not punct_space_more(token)
            ]
            # apply the first-order and second-order phrase models
            bigram_doc = bigram_model[unigram_doc]
            trigram_doc = trigram_model[bigram_doc]
            # remove any remaining stopwords
            trigram_doc = [
                term for term in trigram_doc
                if term not in nlp.Defaults.stop_words
            ]
            # remove the extended stop words as well
            trigram_doc = [
                term for term in trigram_doc
                if term not in stop_words_extended
            ]
            # write the transformed doc as a line in the new file
            trigram_doc = ' '.join(trigram_doc)
            f.write(trigram_doc + '\n')
    if verbose:
        logger.info('Done.')

    # put the text back in the dataframe
    trigram_docs = LineSentence(filepath_dict['trigram_docs_filepath'])
    # count the documents without building a throwaway list
    if sum(1 for _ in trigram_docs) == df_phrased.shape[0]:
        for i, doc in enumerate(trigram_docs):
            df_phrased.iloc[i, df_phrased.columns.get_loc(col)] = ' '.join(doc)
    else:
        raise ValueError(
            'Different number of processed and original documents')

    # save dataframe
    if verbose:
        logger.info('Saving NLP processed data: {}'.format(
            filepath_dict['filepath_out']))
    df_phrased.to_csv(filepath_dict['filepath_out'])

    return df_phrased
def train(cls, wr_path, corpus_file, out_name, size=100, window=15, symmetric=1, min_count=5,
          max_vocab_size=0, sgd_num=100, lrate=0.001, period=10, iter=90, epsilon=0.75,
          dump_period=10, reg=0, alpha=100, beta=99, loss='hinge', memory=4.0, np=1,
          cleanup_files=False, sorted_vocab=1, ensemble=0):
    """Train a Wordrank model by running the external glove and wordrank binaries.

    The vocab and cooccurrence files are generated with the glove tools found
    inside the wordrank directory and are then consumed by the wordrank binary.
    The resulting word and context embeddings are moved into the `out_name`
    directory, which is created inside the wordrank directory.

    `wr_path` is the absolute path to the Wordrank directory.
    `corpus_file` is the filename of the text file to be used for training the Wordrank model.
    Expects file to contain space-separated tokens in a single line.
    `out_name` is the name of the directory which will be created (in the wordrank folder)
    to save embeddings and training data. It will contain:
        'wordrank.words' - word embeddings taken from the last dump,
        'wordrank.contexts' - context embeddings taken from the last dump,
        a 'meta' directory holding a copy of the corpus plus:
            'vocab.txt' - vocab words,
            'wiki.toy' - shuffled word-word cooccurrence values,
            'meta' - vocab and cooccurrence lengths.
    `size` is the dimensionality of the feature vectors.
    `window` is the number of context words to the left (and to the right, if symmetric = 1).
    `symmetric` if 0, only use left context words, else use left and right both.
    `min_count` = ignore all words with total frequency lower than this.
    `max_vocab_size` upper bound on vocabulary size, i.e. keep the <int> most frequent words.
    Default is 0 for no limit.
    `sgd_num` number of SGD taken for each data point.
    `lrate` is the learning rate (too high diverges, give Nan).
    `period` is the period of xi variable updates.
    `iter` = number of iterations (epochs) over the corpus.
    `epsilon` is the power scaling value for weighting function.
    `dump_period` is the period after which embeddings should be dumped.
    `reg` is the value of regularization parameter.
    `alpha` is the alpha parameter of gamma distribution.
    `beta` is the beta parameter of gamma distribution.
    `loss` = name of the loss (logistic, hinge).
    `memory` = soft limit for memory consumption, in GB.
    `np` number of copies to execute. (mpirun option)
    `cleanup_files` if True, delete directory and files used by this wrapper, setting to False
    can be useful for debugging.
    `sorted_vocab` = if 1 (default), sort the vocabulary by descending frequency before
    assigning word indexes.
    `ensemble` = 0 (default); passed through to `load_wordrank_model`, which decides whether
    to combine word and context vectors.

    Returns the model produced by `cls.load_wordrank_model`.
    Raises whatever `os.makedirs` raises if the output directory already exists.
    """
    # prepare training data (cooccurrence matrix and vocab)
    model_dir = os.path.join(wr_path, out_name)
    meta_dir = os.path.join(model_dir, 'meta')
    os.makedirs(meta_dir)
    logger.info("Dumped data will be stored in '%s'", model_dir)

    # Compute the corpus copy's path exactly once with os.path so the file we
    # write is the same file the glove tools read back; splitting on '/' gives
    # a different answer than os.path on platforms with another separator.
    corpus_copy = os.path.join(meta_dir, os.path.basename(corpus_file))
    copyfile(corpus_file, corpus_copy)

    vocab_file = os.path.join(meta_dir, 'vocab.txt')
    temp_vocab_file = os.path.join(meta_dir, 'tempvocab.txt')
    cooccurrence_file = os.path.join(meta_dir, 'cooccurrence')
    cooccurrence_shuf_file = os.path.join(meta_dir, 'wiki.toy')
    meta_file = os.path.join(meta_dir, 'meta')

    cmd_vocab_count = [
        os.path.join(wr_path, 'glove', 'vocab_count'),
        '-min-count', str(min_count), '-max-vocab', str(max_vocab_size)
    ]
    cmd_cooccurence_count = [
        os.path.join(wr_path, 'glove', 'cooccur'), '-memory', str(memory),
        '-vocab-file', temp_vocab_file, '-window-size', str(window), '-symmetric', str(symmetric)
    ]
    cmd_shuffle_cooccurences = [os.path.join(wr_path, 'glove', 'shuffle'), '-memory', str(memory)]
    cmd_del_vocab_freq = ['cut', '-d', " ", '-f', '1', temp_vocab_file]

    # Each glove tool reads its input on stdin and writes its result to stdout.
    commands = [cmd_vocab_count, cmd_cooccurence_count, cmd_shuffle_cooccurences]
    input_fnames = [corpus_copy, corpus_copy, cooccurrence_file]
    output_fnames = [temp_vocab_file, cooccurrence_file, cooccurrence_shuf_file]

    logger.info("Prepare training data (%s) using glove code", ", ".join(input_fnames))
    for command, input_fname, output_fname in zip(commands, input_fnames, output_fnames):
        with smart_open(input_fname, 'rb') as r:
            with smart_open(output_fname, 'wb') as w:
                utils.check_output(w, args=command, stdin=r)

    logger.info("Deleting frequencies from vocab file")
    with smart_open(vocab_file, 'wb') as w:
        utils.check_output(w, args=cmd_del_vocab_freq)

    with smart_open(vocab_file, 'rb') as f:
        numwords = sum(1 for _ in f)
    with smart_open(cooccurrence_shuf_file, 'rb') as f:
        numlines = sum(1 for _ in f)
    with smart_open(meta_file, 'wb') as f:
        # The meta file refers to its sibling files by bare file name.
        meta_info = "{0} {1}\n{2} {3}\n{4} {5}".format(
            numwords, numwords, numlines, os.path.basename(cooccurrence_shuf_file),
            numwords, os.path.basename(vocab_file)
        )
        f.write(meta_info.encode('utf-8'))

    if iter % dump_period == 0:
        # NOTE(review): presumably one extra iteration is needed for the dump
        # of the requested iteration to be written - confirm against wordrank.
        iter += 1
    else:
        logger.warning(
            "Resultant embedding will be from %d iterations rather than the input %d iterations, "
            "as wordrank dumps the embedding only at dump_period intervals. "
            "Input an appropriate combination of parameters (iter, dump_period) "
            "such that \"iter mod dump_period\" is zero.",
            iter - (iter % dump_period), iter
        )

    wr_args = {
        'path': meta_dir,
        'nthread': multiprocessing.cpu_count(),
        'sgd_num': sgd_num,
        'lrate': lrate,
        'period': period,
        'iter': iter,
        'epsilon': epsilon,
        'dump_prefix': 'model',
        'dump_period': dump_period,
        'dim': size,
        'reg': reg,
        'alpha': alpha,
        'beta': beta,
        'loss': loss
    }

    # run wordrank executable with wr_args
    cmd = ['mpirun', '-np', str(np), os.path.join(wr_path, 'wordrank')]
    for option, value in wr_args.items():
        cmd.append('--%s' % option)
        cmd.append(str(value))
    logger.info("Running wordrank binary")
    utils.check_output(args=cmd)

    # use embeddings from max. iteration's dump; the dump files are looked up
    # relative to the current working directory
    max_iter_dump = iter - (iter % dump_period)
    words_file = os.path.join(model_dir, 'wordrank.words')
    contexts_file = os.path.join(model_dir, 'wordrank.contexts')
    os.rename('model_word_%d.txt' % max_iter_dump, words_file)
    os.rename('model_context_%d.txt' % max_iter_dump, contexts_file)
    model = cls.load_wordrank_model(words_file, vocab_file, contexts_file, sorted_vocab, ensemble)

    if cleanup_files:
        rmtree(model_dir)
    return model
def save(self, d, pickle_protocol=2):
    """Pickle `d` into the file named by `self.fname`.

    `d` is the object to serialize.
    `pickle_protocol` is the pickle protocol version handed to `cPickle.dump`.
    """
    # Pickle output is bytes, so the file must be opened in binary mode; a
    # text-mode handle rejects the write on Python 3. The context manager
    # guarantees the handle is closed even when pickling fails midway.
    with smart_open(self.fname, 'wb') as handle:
        cPickle.dump(d, handle, protocol=pickle_protocol)
def load(self):
    """Read the file named by `self.fname` and return the object unpickled from its contents."""
    with smart_open(self.fname) as source:
        payload = source.read()
    # NOTE: unpickling executes arbitrary code; only load files from a trusted source.
    return cPickle.loads(payload)
def writer_count(select_count):
    """Write every `name count` pair of `select_count` to `count_sentences_path`, one per line.

    `select_count` is a mapping whose keys are concatenated with the
    stringified value, separated by a single space.

    Returns a fixed status message (Chinese for "re-adding complete").
    """
    with smart_open(count_sentences_path, 'wb', encoding='utf-8') as sink:
        for name, count in select_count.items():
            record = name + " " + str(count)
            sink.write(record + "\n")
    return "重新添加完成"
def serializeObject(fileName, obj):
    """Pickle `obj` into `fileName`, opened for binary writing through smart_open."""
    with smart_open.smart_open(fileName, "wb") as stream:
        pickle.dump(obj, stream)
def test_file_mode_mock(self, mock_file, mock_boto):
    """Are file:// open modes passed correctly?"""
    # incorrect file mode
    with self.assertRaises(NotImplementedError):
        smart_open.smart_open("s3://bucket/key", "x")

    # correct read modes; the last case checks that "~" is expanded
    short_path = "~/blah"
    read_cases = [
        ("blah", "r", "blah"),
        ("blah", "rb", "blah"),
        (short_path, "rb", os.path.expanduser(short_path)),
    ]
    for uri, mode, opened_path in read_cases:
        smart_open.smart_open(uri, mode)
        mock_file.assert_called_with(opened_path, mode)

    # correct write modes, incorrect scheme
    rejected_cases = [
        ("hdfs:///blah.txt", "wb+"),
        ("http:///blah.txt", "w"),
        ("s3://bucket/key", "wb+"),
    ]
    for uri, mode in rejected_cases:
        with self.assertRaises(NotImplementedError):
            smart_open.smart_open(uri, mode)

    # correct write mode, correct file:// URI
    write_cases = [
        ("blah", "w", "blah"),
        ("file:///some/file.txt", "wb", "/some/file.txt"),
        ("file:///some/file.txt", "wb+", "/some/file.txt"),
        ("file:///some/file.txt", "w+", "/some/file.txt"),
    ]
    for uri, mode, opened_path in write_cases:
        smart_open.smart_open(uri, mode)
        mock_file.assert_called_with(opened_path, mode)
def deserializeObject(fileName):
    """Return the object unpickled from `fileName`, opened for binary reading through smart_open."""
    # NOTE: unpickling executes arbitrary code; only read files from a trusted source.
    with smart_open.smart_open(fileName, "rb") as stream:
        restored = pickle.load(stream)
    return restored
# Business IDs of the restaurants. ids = [ 'EAwh1OmG6t6p3nRaZOW_AA', 'pomGBqfbxcqPv14c3XH-ZQ', 'iCQpiavjjPzJ5_3gPD5Ebg', 'UBv8heCQR0RPnUQG0zkXIQ', '7m1Oa1VYV98UUuo_6i0EZg' ] w2v_corpus = [] # Documents to train word2vec on (all 6 restaurants). wmd_corpus = [] # Documents to run queries against (only one restaurant). documents = [ ] # wmd_corpus, with no pre-processing (so we can see the original documents). biz = {} with smart_open('yelp_dataset/yelp_academic_dataset_business.json', 'rb') as business_data_file: for line in business_data_file: json_line = json.loads(line) biz[json_line['business_id']] = json_line['name'] review_data_dict = {} votes = {} with smart_open('yelp_dataset/yelp_academic_dataset_review.json', 'rb') as data_file: for line in data_file: json_line = json.loads(line) if json_line['business_id'] not in ids: # Not one of the 6 restaurants. continue
help="Input file, in gloVe format (read-only).") parser.add_argument( "-o", "--output", required=True, help="Output file, in word2vec text format (will be overwritten).") args = parser.parse_args() # do the actual conversion num_lines, num_dims = glove2word2vec(args.input, args.output) logger.info('Converted model with %i vectors and %i dimensions', num_lines, num_dims) # test that the converted model loads successfully model = gensim.models.KeyedVectors.load_word2vec_format(args.output, binary=False) logger.info('Model %s successfully loaded', model) try: logger.info('testing the model....') if sys.version_info < (3,): with smart_open(args.output, 'rb') as f: seed_word1, seed_word2 = random.sample([line.split()[0] for line in f], 2) else: with smart_open(args.output, 'r') as f: seed_word1, seed_word2 = random.sample([line.split()[0] for line in f], 2) logger.info('top-10 most similar words to "%s" are: %s', seed_word1, model.most_similar(positive=[seed_word1], topn=10)) logger.info('similarity score between %s and %s: %s', seed_word1, seed_word2, model.similarity(seed_word1, seed_word2)) except: logger.error('error encountered. checking for model file creation now....') if os.path.isfile(os.path.join(args.output)): logger.info('model file %s was created but could not be loaded.', args.output) else: logger.info('model file %s creation failed. ') logger.info('please check the parameters and input file format.') raise
tableSplitList = table.split('<td style="border:1px solid saddlebrown;"') if len(tableSplitList) < 2: print 'table split went poorly %s' % url return url pdList, column_list = table_splitter(tableData) list_of_titles += ['NDIC Index'] df = pd.DataFrame(pdList, columns = listOfTitles) return df if __name__ == '__main__': password_mgr = urllib2.HTTPPasswordMgrWithDefaultRealm() top_level_url = prefix + dfScoutFilt.iloc[0].prodLink opener = secure_open(password_mgr, top_level_url) all_links = get_ndic_with_production_data() pandas_list = [] ## append dataframes to a list, then concat list for big prod table for production_link in all_links.keys(): try: datalines = get_prod_url_data(HTTP_PREFIX + production_link, opener) temp_df, column_names = split_html_table(datalines) pandas_list.append(temp_df, column_names) except: raise dfConcat = pd.concat(pandasList) with smart_open.smart_open(S3_DIR + 'production_data.p', 'wb') as fout: pickle.dump(dfConcat. fout)
def cleanLinksFromFile(inFile):
    """Clean every line of `inFile` in parallel and export the results in chunks.

    Lines are read in groups of 2*N, mapped through `cleanParentsAndChildren`
    on a pool of N workers, and the three outputs of each line are buffered
    in three lists. Whenever a buffer holds at least `exportChunk` entries,
    and once more at the end, the buffers are handed to `writeAllFiles`.

    `inFile` is the path of the input file; it is normalized with `normpath`
    and opened through smart_open.

    Returns None. Errors raised while reading or processing propagate to the
    caller instead of being mistaken for the end of the file.
    """
    # Local import keeps this block self-contained.
    from itertools import islice

    inFile = normpath(inFile)

    # Worker count picked from manual benchmarks with 2*N lines per map call:
    # N=3 -> 330, N=4 -> 350, N=5 -> 333, N=6 -> 355, N=7 -> 356, N=8 -> 340 pages/sec.
    N = 6
    exportChunk = 100000  # lines buffered before they are written out
    if debug:
        N = 1
        exportChunk = 5

    chunkSize = 2 * N
    heartBeat = 5000 * N  # report progress every this many lines

    finalOutput1 = []
    finalOutput2 = []
    finalOutput3 = []
    currentLine = 0

    pool = Pool(N)
    try:
        writeFileFirstTime(inFile)
        start = time.time()
        with smart_open.smart_open(inFile) as fp:
            while True:
                # islice stops cleanly at end of file, so a short (or empty)
                # chunk is the only end-of-file signal; genuine I/O errors
                # are no longer swallowed.
                data = list(islice(fp, chunkSize))
                currentLine += len(data)
                reachedEnd = len(data) < chunkSize
                if reachedEnd:
                    print('reached end of file')

                output = pool.map(cleanParentsAndChildren, data)
                for line in output:
                    finalOutput1.append(line[0])
                    finalOutput2.append(line[1])
                    finalOutput3.append(line[2])

                if len(finalOutput1) >= exportChunk:
                    writeAllFiles(inFile, finalOutput1, finalOutput2, finalOutput3)
                    finalOutput1 = []
                    finalOutput2 = []
                    finalOutput3 = []

                if data and currentLine % heartBeat == 0:
                    eTime = time.time() - start
                    start = time.time()
                    if eTime > 0:
                        print('currentLine:  %s est. l/sec:  %s'
                              % (currentLine, heartBeat / float(eTime)))
                        sys.stdout.flush()

                if reachedEnd:
                    break

        # export whatever is still buffered
        writeAllFiles(inFile, finalOutput1, finalOutput2, finalOutput3)
    finally:
        # Always release the worker processes, even when processing failed.
        pool.close()
        pool.join()
    return
def test_no_kwargs(self, mock_session):
    """With no keyword arguments, the session gets `profile_name=None` and asks for 's3'."""
    smart_open.smart_open('s3://mybucket/mykey')

    mock_session.assert_called_with(profile_name=None)
    resource_factory = mock_session.return_value.resource
    resource_factory.assert_called_with('s3')
def test_file_buffering2(self, mock_smart_open):
    """A positional buffering argument is forwarded to the opener as `buffering=`."""
    path, mode, buffering = '/tmp/somefile', 'rb', 0
    reader = smart_open.smart_open(path, mode, buffering)
    reader.__iter__()
    # called with the correct expanded path?
    mock_smart_open.assert_called_with(path, mode, buffering=buffering)
def test_binary(self):
    """Reading in 'rb' mode returns the raw bytes and opens with default buffering."""
    fake_open = mock.Mock(return_value=self.bytesio)
    with mock.patch(_BUILTIN_OPEN, fake_open) as mock_open:
        with smart_open.smart_open("blah", "rb") as fin:
            content = fin.read()
            self.assertEqual(content, self.as_bytes)
            mock_open.assert_called_with("blah", "rb", buffering=-1)
def test_profile(self, mock_session):
    """A `profile_name` keyword is forwarded to the session, which then asks for 's3'."""
    profile = 'my_credentials'
    smart_open.smart_open('s3://mybucket/mykey', profile_name=profile)

    mock_session.assert_called_with(profile_name=profile)
    resource_factory = mock_session.return_value.resource
    resource_factory.assert_called_with('s3')
#!/usr/bin/python3.6
#########################
# python3 file kafka_producer.py
# read data from s3 and send it to kafka-cluster
#
# usage: kafka_producer.py <file index> <rate>
#   argv[1] selects the input file '<s3_fpath_user>_<argv[1]>.csv'
#   argv[2] controls the pause between messages (see `dt` below)
#########################
from kafka import KafkaProducer
from smart_open import smart_open
import yaml
from random import randint
import sys
import time

# read config file
with open("config.yml", 'r') as ymlfile:
    # safe_load builds plain Python data only; yaml.load() without an explicit
    # Loader can construct arbitrary objects and is rejected by recent PyYAML.
    config = yaml.safe_load(ymlfile)

fpath = config['s3_fpath_user'] + '_' + str(sys.argv[1]) + '.csv'
producer = KafkaProducer(bootstrap_servers=config['bootstrap_servers_address'])

# Pause between two messages. Clamped at zero because time.sleep() raises
# ValueError on a negative value, which a large argv[2] would produce.
dt = max(0.0, 4 / float(sys.argv[2]) - 1 / 600)

# The context manager closes the S3 stream once every line has been sent.
with smart_open(fpath, 'rb') as users:
    for user in users:
        # send the sessions; strip only the line terminator so a final line
        # without a trailing newline does not lose its last byte
        producer.send('requests', user.rstrip(b'\n'))
        producer.flush()
        time.sleep(dt)
def predicting(self):
    """Label every sentence of the S3 input text and upload the result as a CSV.

    Reads 's3://inputbucket1221/testdata.txt', splits each line on '.', keeps
    the pieces longer than five characters, strips punctuation and digits,
    and posts each cleaned piece to the prediction service on localhost. The
    two scores parsed out of the reply text decide the label ("Positive",
    "Negative" or "Neutral"). The labelled sentences are written to
    'final_sentiment.csv', uploaded to 's3://outputbucket1221/', and the flow
    continues with `self.end`.
    """
    import boto3
    from metaflow import S3
    import re
    import pandas as pd
    from nltk import tokenize
    import string
    import nltk
    from nltk.corpus import stopwords
    from smart_open import smart_open
    import requests

    for resource in ('punkt', 'stopwords', 'words'):
        nltk.download(resource)

    # Pool the '.'-separated pieces of every input line.
    listed = []
    with smart_open('s3://inputbucket1221/testdata.txt', 'r') as s3_source:
        # readline() returns '' once the stream is exhausted
        for raw_line in iter(s3_source.readline, ''):
            listed.extend(raw_line.split("."))

    long_pieces = [piece for piece in listed if len(piece) > 5]

    df = pd.DataFrame()
    df['Text'] = long_pieces
    print(df['Text'])

    def remove_punct(text):
        kept = [char for char in text if char not in string.punctuation]
        return re.sub('[0-9]+', '', "".join(kept))

    df['Textclean'] = df['Text'].apply(remove_punct)
    df = df.dropna()

    url = 'http://localhost:5000/model/predict'
    senti_list = []
    for sentence in df['Textclean']:
        reply = requests.post(url, json={'text': [sentence]})
        res = reply.text.split()
        # The scores sit at fixed token positions of the reply text, with
        # trailing punctuation sliced off before conversion.
        first_score = float(res[4][:-1])
        second_score = float(res[6][:-3])
        if first_score > second_score:
            label = "Positive"
        elif first_score < second_score:
            label = "Negative"
        else:
            label = "Neutral"
        senti_list.append((sentence, label))

    sentiment = pd.DataFrame(senti_list, columns=['Sentence', 'Sentiment'])
    sentiment.to_csv('final_sentiment.csv', index=False)
    with S3(s3root='s3://outputbucket1221/') as s3:
        s3.put_files([('final_sentiment.csv', 'final_sentiment.csv')])
    self.next(self.end)
return [] # In[ ]: processed = [] chunk_size = 5000 # data = pd.DataFrame() letters = [ "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z" ] for letter in letters: csv_name = "s3://millionsongprocessed/data_" + letter + ".tsv" with smart_open.smart_open(csv_name, 'w') as fout: writer = csv.writer(fout, delimiter='\t', lineterminator='\n') print(letter) for prefix in get_prefixes(letter): if (transform_s3(prefix) == []): continue processed.append(transform_s3(prefix)) if len(processed) % chunk_size == 0: # part_data = rows_to_file(processed) print("running") # data = pd.concat([data, part_data], axis=0) writer.writerows(processed) processed = []
def get_records_batch(self, hook, query_filter):
    """Stream the table to S3 as newline-delimited JSON, one primary-key range per batch.

    The minimum (positive) and maximum primary key are looked up first; the
    key range is then walked in steps of `self.batchsize`, and every batch of
    rows is serialized as one JSON object per line and written to a single
    S3 object through smart_open.

    `hook` is the database hook; this method calls its `get_pandas_df` and
    `get_records`, and expects each record to offer `.items()`.
    `query_filter` is either '' or a complete WHERE clause; the per-batch key
    range is appended to it.

    Returns None.
    """
    import os

    # The batch range predicate is appended right after the filter, so the
    # filter must end in either a bare WHERE or a trailing AND.
    if query_filter == '':
        query_filter = 'WHERE'
    else:
        query_filter = query_filter + ' AND '

    count_sql_max = "SELECT max({0}) as c FROM {1}".format(
        self.primary_key, self.mssql_table)
    # Only keys greater than zero take part in batching.
    count_sql_min = "SELECT min({0}) as c FROM {1} WHERE {0}>0".format(
        self.primary_key, self.mssql_table)
    count_sql_max_incremental = "SELECT max({0}) as c FROM {1}".format(
        self.incremental_key, self.mssql_table)
    count_sql_min_incremental = "SELECT min({0}) as c FROM {1}".format(
        self.incremental_key, self.mssql_table)

    count = int(hook.get_pandas_df(count_sql_max)['c'][0])
    min_count = int(hook.get_pandas_df(count_sql_min)['c'][0])
    max_date = hook.get_pandas_df(count_sql_max_incremental)['c'][0]
    min_date = hook.get_pandas_df(count_sql_min_incremental)['c'][0]

    s3_conn = BaseHook('S3').get_connection(self.s3_conn_id)
    s3_creds = s3_conn.extra_dejson

    s3_key = self.s3_key
    # NOTE(review): the original read undefined bare names `s3_key_suffix` and
    # `s3_key` here and then overwrote the result; this assumes the flag is an
    # optional instance attribute and that the date range belongs between the
    # key's stem and its extension - confirm against the operator's __init__.
    if getattr(self, 's3_key_suffix', None):
        stem, extension = os.path.splitext(s3_key)
        s3_key = '{0}{1}-{2}{3}'.format(stem, min_date, max_date, extension)
    s3_key = '{}/{}'.format(self.s3_bucket, s3_key)

    # The URL embeds the credentials, so it must never be logged.
    url = 's3://{}:{}@{}'.format(s3_creds['aws_access_key_id'],
                                 s3_creds['aws_secret_access_key'],
                                 s3_key)

    logging.info('Initiating record retrieval in batches.')
    logging.info('Query: %s', count_sql_min)
    logging.info('Start Date: %s', self.start)
    logging.info('End Date: %s', self.end)
    logging.info('smallest_number: %s', min_count)
    logging.info('count: %s', count)

    # Smart Open is a library for efficiently streaming large files to S3.
    # Streaming data to S3 here so it doesn't break the task container.
    # https://pypi.python.org/pypi/smart_open
    # Does this here because smart_open doesn't yet support an
    # append mode and doing it as a function was causing the file to be
    # overwritten every time.
    with smart_open.smart_open(url, 'wb') as fout:
        logging.info('First Row %s', min_count)
        logging.info('Total Rows: %s', count)
        logging.info('Batch Size: %s', self.batchsize)

        # `count` is the largest key and each batch's upper bound is
        # exclusive, so the range has to run through count + 1 or the last
        # row is lost whenever it falls exactly on a batch boundary.
        for batch in range(min_count, count + 1, self.batchsize):
            batch_end = batch + self.batchsize
            query = (
                "SELECT * FROM {table} {query_filter} "
                "{primary_key} >= {batch} AND {primary_key} < {batch_two};"
            ).format(table=self.mssql_table,
                     primary_key=self.primary_key,
                     query_filter=query_filter,
                     batch=batch,
                     batch_two=batch_end)
            logging.info(query)

            rows = hook.get_records(query)
            logging.info('Successfully performed query for batch %s-%s.',
                         batch, batch_end)

            # Lower-case every column name (nulls included, so a column does
            # not switch key between rows) and stringify non-null values.
            records = [
                {key.lower(): (str(value) if value is not None else None)
                 for key, value in row.items()}
                for row in rows
            ]
            if not records:
                continue

            # Terminate every batch with a newline; otherwise the last record
            # of one batch and the first of the next share a line.
            payload = '\n'.join(json.dumps(record) for record in records) + '\n'
            logging.info("Uploading!")
            fout.write(payload.encode('utf-8'))
writer = avro.datafile.DataFileWriter(foutd, avro.io.DatumWriter(), schema) for ll, row in enumerate(dictRes): writer.append(row) writer.close() # # The above two functions appear to work identically. # write_avro = write_avro_context_manager with open('local.avro', 'wb') as foutd: logging.critical('writing to %r', foutd) write_avro(foutd) with smart_open.smart_open('local-so.avro', 'wb') as foutd: logging.critical('writing to %r', foutd) write_avro(foutd) subprocess.check_call(['diff', 'local.avro', 'local-so.avro']) print('sanity check OK') def split_s3_url(url): parsed = urlparse.urlparse(url) return parsed.netloc, parsed.path[1:] def read_avro(fin): reader = avro.datafile.DataFileReader(fin, avro.io.DatumReader()) return list(reader)