Code example #1
File: glove2word2vec.py Project: abs51295/gensim
def glove2word2vec(glove_input_file, word2vec_output_file):
    """Convert `glove_input_file` in GloVe format into `word2vec_output_file` in word2vec format."""
    num_lines, num_dims = get_glove_info(glove_input_file)
    logger.info("converting %i vectors from %s to %s", num_lines, glove_input_file, word2vec_output_file)
    with smart_open(word2vec_output_file, 'wb') as fout:
        fout.write("{0} {1}\n".format(num_lines, num_dims).encode('utf-8'))
        with smart_open(glove_input_file, 'rb') as fin:
            for line in fin:
                fout.write(line)
    return num_lines, num_dims
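A hedged usage sketch for the converter above: in released gensim this function is importable from gensim.scripts.glove2word2vec; the file names below are placeholders.

from gensim.scripts.glove2word2vec import glove2word2vec

num_lines, num_dims = glove2word2vec('glove.6B.100d.txt', 'glove.6B.100d.w2v.txt')
print('converted %d vectors of dimensionality %d' % (num_lines, num_dims))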
Code example #2
    def testConversion(self):
        word2vec2tensor(word2vec_model_path=self.datapath, tensor_filename=self.output_folder)

        with smart_open(self.metadata_file, 'rb') as f:
            metadata = f.readlines()

        with smart_open(self.tensor_file, 'rb') as f:
            vectors = f.readlines()

        # check if number of words and vector size in tensor file line up with word2vec
        with smart_open(self.datapath, 'rb') as f:
            first_line = f.readline().strip()

        number_words, vector_size = map(int, first_line.split(b' '))
        self.assertTrue(len(metadata) == len(vectors) == number_words,
            ('Metadata file %s and tensor file %s imply different number of rows.'
                % (self.metadata_file, self.tensor_file)))

        # grab metadata and vectors from written file
        metadata = [word.strip() for word in metadata]
        vectors = [vector.replace(b'\t', b' ') for vector in vectors]

        # get the original KeyedVectors model
        orig_model = KeyedVectors.load_word2vec_format(self.datapath, binary=False)

        # check that the KV model and tensor files have the same values key-wise
        for word, vector in zip(metadata, vectors):
            word_string = word.decode("utf8")
            vector_string = vector.decode("utf8")
            vector_array = np.array(list(map(float, vector_string.split())))
            np.testing.assert_almost_equal(orig_model[word_string], vector_array, decimal=5)
Code example #3
    def test_s3_iter_moto(self):
        """Are S3 files iterated over correctly?"""
        # a list of strings to test with
        expected = [b"*" * 5 * 1024**2] + [b'0123456789'] * 1024 + [b"test"]

        # create fake bucket and fake key
        s3 = boto3.resource('s3')
        s3.create_bucket(Bucket='mybucket')

        with smart_open.smart_open("s3://mybucket/mykey", "wb", s3_min_part_size=5 * 1024**2) as fout:
            # write a single huge line (=full multipart upload)
            fout.write(expected[0] + b'\n')

            # write lots of small lines
            for lineno, line in enumerate(expected[1:-1]):
                fout.write(line + b'\n')

            # ...and write the last line too, no newline at the end
            fout.write(expected[-1])

        # connect to fake s3 and read from the fake key we filled above
        smart_open_object = smart_open.smart_open("s3://mybucket/mykey")
        output = [line.rstrip(b'\n') for line in smart_open_object]
        self.assertEqual(output, expected)

        # same thing but using a context manager
        with smart_open.smart_open("s3://mybucket/mykey") as smart_open_object:
            output = [line.rstrip(b'\n') for line in smart_open_object]
            self.assertEqual(output, expected)
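The *_moto tests above assume S3 is mocked out; a minimal, hypothetical scaffolding (not the project's actual fixture, written against older moto releases where mock_s3 exists) might look like:

import boto3
import moto
import smart_open

with moto.mock_s3():  # patch boto3 so no real AWS calls are made
    s3 = boto3.resource('s3')
    s3.create_bucket(Bucket='mybucket')
    with smart_open.smart_open('s3://mybucket/mykey', 'wb') as fout:
        fout.write(b'hello\n')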
Code example #4
    def test_s3_boto(self, mock_s3_open_read, mock_boto):
        """Is S3 line iterator called correctly?"""
        # Configure the mock boto.config.get to return default host
        smart_open.smart_open_lib.boto.config.get.return_value = 's3.amazonaws.com'

        # no credentials
        smart_open_object = smart_open.smart_open("s3://mybucket/mykey")
        smart_open_object.__iter__()
        mock_boto.connect_s3.assert_called_with(aws_access_key_id=None, aws_secret_access_key=None, profile_name=None, host='s3.amazonaws.com')

        # with credential
        smart_open_object = smart_open.smart_open("s3://access_id:access_secret@mybucket/mykey")
        smart_open_object.__iter__()
        mock_boto.connect_s3.assert_called_with(aws_access_key_id="access_id", aws_secret_access_key="access_secret", profile_name=None, host='s3.amazonaws.com')

        # with credential profile
        smart_open_object = smart_open.smart_open("s3://mybucket/mykey", profile_name="my_credentials")
        smart_open_object.__iter__()
        mock_boto.connect_s3.assert_called_with(aws_access_key_id=None, aws_secret_access_key=None, profile_name="my_credentials", host='s3.amazonaws.com')

        # lookup bucket, key; call s3_iter_lines
        smart_open_object = smart_open.smart_open("s3://access_id:access_secret@mybucket/mykey")
        smart_open_object.__iter__()
        mock_boto.connect_s3().get_bucket.assert_called_with("mybucket")
        mock_boto.connect_s3().get_bucket().get_key.assert_called_with("mykey")
        #
        # TODO: this is kind of a useless assertion...
        #
        self.assertTrue(smart_open_object.__iter__.called)

        # with user-specified host
        smart_open_object = smart_open.smart_open("s3://access_id:access_secret@mybucket/mykey", host='aa.domain.com')
        smart_open_object.__iter__()
        mock_boto.connect_s3.assert_called_with(aws_access_key_id="access_id", aws_secret_access_key="access_secret", profile_name=None, host='aa.domain.com')
Code example #5
    def test_s3_metadata_write(self):
        # Read local file fixture
        path = os.path.join(CURR_DIR, 'test_data/crime-and-punishment.txt.gz')
        data = ""
        with smart_open.smart_open(path, 'rb') as fd:
            data = fd.read()

        # Create a test bucket
        s3 = boto3.resource('s3')
        s3.create_bucket(Bucket='mybucket')

        # Write data, with multipart_upload options
        write_stream = smart_open.smart_open(
            's3://mybucket/crime-and-punishment.txt.gz', 'wb',
            s3_upload={
                'ContentType': 'text/plain',
                'ContentEncoding': 'gzip'
            }
        )
        with write_stream as fout:
            fout.write(data)

        key = s3.Object('mybucket', 'crime-and-punishment.txt.gz')
        self.assertIn('text/plain', key.content_type)
        self.assertEqual(key.content_encoding, 'gzip')
Code example #6
    def test_s3_iter_moto(self):
        """Are S3 files iterated over correctly?"""
        # a list of strings to test with
        expected = [b"*" * 5 * 1024**2] + [b'0123456789'] * 1024 + [b"test"]

        # create fake bucket and fake key
        conn = boto.connect_s3()
        conn.create_bucket("mybucket")
        # lower the multipart upload size, to speed up these tests
        smart_open_lib.S3_MIN_PART_SIZE = 5 * 1024**2
        with smart_open.smart_open("s3://mybucket/mykey", "wb") as fout:
            # write a single huge line (=full multipart upload)
            fout.write(expected[0] + b'\n')

            # write lots of small lines
            for lineno, line in enumerate(expected[1:-1]):
                fout.write(line + b'\n')

            # ...and write the last line too, no newline at the end
            fout.write(expected[-1])

        # connect to fake s3 and read from the fake key we filled above
        smart_open_object = smart_open.smart_open("s3://mybucket/mykey")
        output = [line.rstrip(b'\n') for line in smart_open_object]
        self.assertEqual(output, expected)

        # same thing but using a context manager
        with smart_open.smart_open("s3://mybucket/mykey") as smart_open_object:
            output = [line.rstrip(b'\n') for line in smart_open_object]
            self.assertEqual(output, expected)
Code example #7
    def test_http_bz2(self):
        """Can open bz2 via http?"""
        test_string = b'Hello World Compressed.'
        #
        # TODO: why are these tests writing to temporary files?  We can do the
        # bz2 compression in memory.
        #
        with tempfile.NamedTemporaryFile('wb', suffix='.bz2', delete=False) as infile:
            test_file = infile.name

        with smart_open.smart_open(test_file, 'wb') as outfile:
            outfile.write(test_string)

        with open(test_file, 'rb') as infile:
            compressed_data = infile.read()

        if os.path.isfile(test_file):
            os.unlink(test_file)

        responses.add(responses.GET, "http://127.0.0.1/data.bz2",
                      body=compressed_data, stream=True)
        smart_open_object = smart_open.smart_open("http://127.0.0.1/data.bz2")

        # smart_open transparently decompresses the bz2 stream; the content should match the original string
        self.assertEqual(smart_open_object.read(), test_string)
Code example #8
File: convert.py Project: jroakes/glove-to-word2vec
def get_lines(glove_file_name):
    """Return the number of vectors and dimensions in a file in GloVe format."""
    with smart_open.smart_open(glove_file_name, 'r') as f:
        num_lines = sum(1 for line in f)
    with smart_open.smart_open(glove_file_name, 'r') as f:
        num_dims = len(f.readline().split()) - 1
    return num_lines, num_dims
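A hedged single-pass variant of get_lines: read the dimensionality from the first row, then keep counting lines without reopening the file. The function name is illustrative.

import smart_open

def get_lines_single_pass(glove_file_name):
    """Return (num_lines, num_dims) for a file in GloVe format in one pass."""
    with smart_open.smart_open(glove_file_name, 'r') as f:
        first_line = f.readline()
        num_dims = len(first_line.split()) - 1
        num_lines = 1 + sum(1 for _ in f)
    return num_lines, num_dims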
Code example #9
    def test_file(self, mock_smart_open):
        """Is file:// line iterator called correctly?"""
        prefix = "file://"
        full_path = '/tmp/test.txt'
        read_mode = "rb"
        smart_open_object = smart_open.smart_open(prefix+full_path, read_mode)
        smart_open_object.__iter__()
        # called with the correct path?
        mock_smart_open.assert_called_with(full_path, read_mode, buffering=-1)

        full_path = '/tmp/test#hash##more.txt'
        read_mode = "rb"
        smart_open_object = smart_open.smart_open(prefix+full_path, read_mode)
        smart_open_object.__iter__()
        # called with the correct path?
        mock_smart_open.assert_called_with(full_path, read_mode, buffering=-1)

        full_path = 'aa#aa'
        read_mode = "rb"
        smart_open_object = smart_open.smart_open(full_path, read_mode)
        smart_open_object.__iter__()
        # called with the correct path?
        mock_smart_open.assert_called_with(full_path, read_mode, buffering=-1)

        short_path = "~/tmp/test.txt"
        full_path = os.path.expanduser(short_path)
Code example #10
def word2vec2tensor(word2vec_model_path, tensor_filename, binary=False):
    """Convert file in Word2Vec format and writes two files 2D tensor TSV file.

    File "tensor_filename"_tensor.tsv contains word-vectors, "tensor_filename"_metadata.tsv contains words.

    Parameters
    ----------
    word2vec_model_path : str
        Path to file in Word2Vec format.
    tensor_filename : str
        Prefix for output files.
    binary : bool, optional
        True if input file in binary format.

    """
    model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_model_path, binary=binary)
    outfiletsv = tensor_filename + '_tensor.tsv'
    outfiletsvmeta = tensor_filename + '_metadata.tsv'

    with smart_open(outfiletsv, 'wb') as file_vector, smart_open(outfiletsvmeta, 'wb') as file_metadata:
        for word in model.index2word:
            file_metadata.write(gensim.utils.to_utf8(word) + gensim.utils.to_utf8('\n'))
            vector_row = '\t'.join(str(x) for x in model[word])
            file_vector.write(gensim.utils.to_utf8(vector_row) + gensim.utils.to_utf8('\n'))

    logger.info("2D tensor file saved to %s", outfiletsv)
    logger.info("Tensor metadata file saved to %s", outfiletsvmeta)
Code example #11
    def test_s3_mode_mock(self, mock_session):
        """Are s3:// open modes passed correctly?"""

        # correct write mode, correct s3 URI
        smart_open.smart_open("s3://mybucket/mykey", "w", host='s3.amazonaws.com')
        mock_session.return_value.resource.assert_called_with(
            's3', endpoint_url='http://s3.amazonaws.com'
        )
Code example #12
File: test_smart_open.py Project: salilb/smart_open
 def test_s3_mode_mock(self, mock_write, mock_boto):
     """Are s3:// open modes passed correctly?"""
     # correct write mode, correct s3 URI
     smart_open.smart_open("s3://mybucket/mykey", "w")
     mock_boto.connect_s3.assert_called_with(aws_access_key_id=None, aws_secret_access_key=None)
     mock_boto.connect_s3().lookup.return_value = True
     mock_boto.connect_s3().get_bucket.assert_called_with("mybucket")
     self.assertTrue(mock_write.called)
Code example #13
    def test_session_write_mode(self):
        """
        Write stream should use a custom boto3.Session
        """
        session = boto3.Session()
        session.resource = mock.MagicMock()

        smart_open.smart_open('s3://bucket/key', 'wb', s3_session=session)
        session.resource.assert_called_with('s3')
Code example #14
    def write_read_assertion(self, test_file):
        with smart_open.smart_open(test_file, 'wb') as fout:  # 'b' for binary, needed on Windows
            fout.write(self.TEXT.encode('utf8'))

        with smart_open.smart_open(test_file, 'rb') as fin:
            self.assertEqual(fin.read().decode('utf8'), self.TEXT)

        if os.path.isfile(test_file):
            os.unlink(test_file)
Code example #15
    def test_gzip_write_mode(self):
        """Should always open in binary mode when writing through a codec."""
        s3 = boto3.resource('s3')
        s3.create_bucket(Bucket='bucket')
        uri = smart_open_lib._parse_uri("s3://bucket/key.gz")

        with mock.patch('smart_open.s3.open') as mock_open:
            smart_open.smart_open("s3://bucket/key.gz", "wb")
            mock_open.assert_called_with('bucket', 'key.gz', 'wb')
Code example #16
 def prepend_line(infile, outfile, line):
     """
     Prepend `line` to the contents of `infile` and write the result to `outfile`, using smart_open.
     """
     with smart_open.smart_open(infile, 'rb') as old:
         with smart_open.smart_open(outfile, 'wb') as new:
             # the streams are binary, so encode the prepended header line
             new.write((str(line) + "\n").encode('utf-8'))
             for existing_line in old:
                 new.write(existing_line)
     return outfile
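A hedged usage sketch for prepend_line: prepending the "<num_vectors> <num_dims>" header that the word2vec text format expects in front of a GloVe file (file names and counts are placeholders).

prepend_line('glove.6B.100d.txt', 'glove.6B.100d.w2v.txt', '400000 100')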
Code example #17
    def test_readline(self):
        """Does readline() return the correct file content?"""
        s3 = boto3.resource('s3')
        s3.create_bucket(Bucket='mybucket')
        test_string = u"hello žluťoučký world!\nhow are you?".encode('utf8')
        with smart_open.smart_open("s3://mybucket/mykey", "wb") as fout:
            fout.write(test_string)

        reader = smart_open.smart_open("s3://mybucket/mykey", "rb")
        self.assertEqual(reader.readline(), u"hello žluťoučký world!\n".encode("utf-8"))
Code example #18
    def test_s3_mode_mock(self, mock_write, mock_boto):
        """Are s3:// open modes passed correctly?"""
        # Configure the mock boto.config.get to return default host
        smart_open.smart_open_lib.boto.config.get.return_value = 's3.amazonaws.com'

        # correct write mode, correct s3 URI
        smart_open.smart_open("s3://mybucket/mykey", "w")
        mock_boto.connect_s3.assert_called_with(aws_access_key_id=None, aws_secret_access_key=None, profile_name=None, host='s3.amazonaws.com')
        mock_boto.connect_s3().lookup.return_value = True
        mock_boto.connect_s3().get_bucket.assert_called_with("mybucket")
        self.assertTrue(mock_write.called)
Code example #19
 def test_read_encoding_implicit_text(self):
     """Should open the file with the correct encoding, implicit text read."""
     s3 = boto3.resource('s3')
     s3.create_bucket(Bucket='bucket')
     key = "s3://bucket/key.txt"
     text = u'это знала ева, это знал адам, колеса любви едут прямо по нам'
     with smart_open.smart_open(key, 'wb') as fout:
         fout.write(text.encode('koi8-r'))
     with smart_open.smart_open(key, encoding='koi8-r') as fin:
         actual = fin.read()
     self.assertEqual(text, actual)
Code example #20
    def test_write_encoding(self):
        """Should open the file for writing with the correct encoding."""
        s3 = boto3.resource('s3')
        s3.create_bucket(Bucket='bucket')
        key = "s3://bucket/key.txt"
        text = u'какая боль, какая боль, аргентина - ямайка, 5-0'

        with smart_open.smart_open(key, 'w', encoding='koi8-r') as fout:
            fout.write(text)
        with smart_open.smart_open(key, encoding='koi8-r') as fin:
            actual = fin.read()
        self.assertEqual(text, actual)
Code example #21
    def test_hdfs(self, mock_subprocess):
        """Is HDFS line iterator called correctly?"""
        mock_subprocess.PIPE.return_value = "test"
        smart_open_object = smart_open.smart_open("hdfs:///tmp/test.txt")
        smart_open_object.__iter__()
        # called with the correct params?
        mock_subprocess.Popen.assert_called_with(["hdfs", "dfs", "-cat", "/tmp/test.txt"], stdout=mock_subprocess.PIPE)

        # second possibility of schema
        smart_open_object = smart_open.smart_open("hdfs://tmp/test.txt")
        smart_open_object.__iter__()
        mock_subprocess.Popen.assert_called_with(["hdfs", "dfs", "-cat", "/tmp/test.txt"], stdout=mock_subprocess.PIPE)
Code example #22
    def test_readline_eof(self):
        """Does readline() return empty string on EOF?"""
        s3 = boto3.resource('s3')
        s3.create_bucket(Bucket='mybucket')
        with smart_open.smart_open("s3://mybucket/mykey", "wb"):
            pass

        reader = smart_open.smart_open("s3://mybucket/mykey", "rb")

        self.assertEqual(reader.readline(), b"")
        self.assertEqual(reader.readline(), b"")
        self.assertEqual(reader.readline(), b"")
Code example #23
    def test_s3_iter_lines(self):
        """Does s3_iter_lines give correct content?"""
        # create fake bucket and fake key
        s3 = boto3.resource('s3')
        s3.create_bucket(Bucket='mybucket')
        test_string = u"hello žluťoučký world!\nhow are you?".encode('utf8')
        with smart_open.smart_open("s3://mybucket/mykey", "wb") as fout:
            fout.write(test_string)

        # call s3_iter_lines and check output
        reader = smart_open.smart_open("s3://mybucket/mykey", "rb")
        output = list(reader)
        self.assertEqual(b''.join(output), test_string)
Code example #24
    def test_write_bad_encoding_replace(self):
        """Should open the file for writing with the correct encoding."""
        s3 = boto3.resource('s3')
        s3.create_bucket(Bucket='bucket')
        key = "s3://bucket/key.txt"
        text = u'欲しい気持ちが成長しすぎて'
        expected = u'?' * len(text)

        with smart_open.smart_open(key, 'w', encoding='koi8-r', errors='replace') as fout:
            fout.write(text)
        with smart_open.smart_open(key, encoding='koi8-r') as fin:
            actual = fin.read()
        self.assertEqual(expected, actual)
Code example #25
    def test_read_never_returns_none(self):
        """read should never return None."""
        s3 = boto3.resource('s3')
        s3.create_bucket(Bucket='mybucket')

        test_string = u"ветер по морю гуляет..."
        with smart_open.smart_open("s3://mybucket/mykey", "wb") as fout:
            fout.write(test_string.encode('utf8'))

        r = smart_open.smart_open("s3://mybucket/mykey", "rb")
        self.assertEqual(r.read(), test_string.encode("utf-8"))
        self.assertEqual(r.read(), b"")
        self.assertEqual(r.read(), b"")
Code example #26
    def test_gzip_read_mode(self):
        """Should always open in binary mode when reading through a codec."""
        s3 = boto3.resource('s3')
        s3.create_bucket(Bucket='bucket')
        key = "s3://bucket/key.gz"

        text = u"если-б я был султан и имел трёх жён, то тройной красотой был бы окружён"
        with smart_open.smart_open(key, "wb") as fout:
            fout.write(text.encode("utf-8"))

        with mock.patch('smart_open.s3.open') as mock_open:
            smart_open.smart_open(key, "r")
            mock_open.assert_called_with('bucket', 'key.gz', 'rb')
Code example #27
    def test_write_bad_encoding_replace(self):
        """Should replace characters that failed to encode."""
        text = u'欲しい気持ちが成長しすぎて'
        expected = u'?' * len(text)

        with tempfile.NamedTemporaryFile('wb', delete=True) as infile:
            with smart_open.smart_open(infile.name, 'w', encoding='koi8-r',
                                       errors='replace') as fout:
                fout.write(text)
            with smart_open.smart_open(infile.name, 'r', encoding='koi8-r') as fin:
                actual = fin.read()

        self.assertEqual(expected, actual)
Code example #28
    def test_r(self):
        """Reading a UTF string should work."""
        text = u"физкульт-привет!"

        s3 = boto3.resource('s3')
        s3.create_bucket(Bucket='bucket')
        key = s3.Object('bucket', 'key')
        key.put(Body=text.encode('utf-8'))

        with smart_open.smart_open('s3://bucket/key', "rb") as fin:
            self.assertEqual(fin.read(), text.encode('utf-8'))

        with smart_open.smart_open('s3://bucket/key', "r", encoding='utf-8') as fin:
            self.assertEqual(fin.read(), text)
Code example #29
    def test_readline_iter(self):
        """Does __iter__ return the correct file content?"""
        s3 = boto3.resource('s3')
        s3.create_bucket(Bucket='mybucket')
        lines = [u"всем привет!\n", u"что нового?"]
        with smart_open.smart_open("s3://mybucket/mykey", "wb") as fout:
            fout.write("".join(lines).encode("utf-8"))

        reader = smart_open.smart_open("s3://mybucket/mykey", "rb")

        actual_lines = [l.decode("utf-8") for l in reader]
        self.assertEqual(2, len(actual_lines))
        self.assertEqual(lines[0], actual_lines[0])
        self.assertEqual(lines[1], actual_lines[1])
Code example #30
    def test_s3_modes_moto(self):
        """Do s3:// open modes work correctly?"""
        # fake bucket and key
        conn = boto.connect_s3()
        conn.create_bucket("mybucket")
        test_string = b"second test"

        # correct write mode, correct s3 URI
        with smart_open.smart_open("s3://mybucket/newkey", "wb") as fout:
            fout.write(test_string)

        output = list(smart_open.smart_open("s3://mybucket/newkey", "rb"))

        self.assertEqual(output, [test_string])
Code example #31
def copy_from_s3_file_to_db(key):
    f = smart_open(key, 'r')
    cur.copy_from(f, db_table, sep=',')
    f.close()
Code example #32
File: utils.py Project: zaffnet/gensim
def unpickle(fname):
    """Load pickled object from `fname`"""
    with smart_open(fname) as f:
        # Because of loading from S3 load can't be used (missing readline in smart_open)
        return _pickle.loads(f.read())
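For symmetry with the loader above, a minimal sketch of a matching save helper, assuming the same bare smart_open import; the function name is illustrative.

import pickle as _pickle
from smart_open import smart_open

def pickle_to(obj, fname, protocol=2):
    """Pickle `obj` to `fname`, which may be a local path or any URI smart_open understands."""
    with smart_open(fname, 'wb') as fout:
        _pickle.dump(obj, fout, protocol=protocol)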
Code example #33
 def __iter__(self):
     with smart_open.smart_open('ruwiki_text_lem_comp.txt',
                                'r',
                                encoding='utf-8') as text:
         for line in text:
             yield list(tokenize(line))
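A corpus class exposing __iter__ like the one above is typically streamed straight into gensim; a minimal sketch, assuming the class is named RuwikiCorpus (hypothetical) and a gensim 4.x Word2Vec API.

from gensim.models import Word2Vec

sentences = RuwikiCorpus()  # hypothetical name for the class defining __iter__ above
model = Word2Vec(sentences=sentences, vector_size=100, window=5, min_count=5, workers=4)
model.save('ruwiki_word2vec.model')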
Code example #34
def train(x_train, y_train, word_id_dict, x_dev, y_dev):
    # Training
    # ==================================================

    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
          allow_soft_placement=FLAGS.allow_soft_placement,
          log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn = TextCNN(FLAGS.flag_values_dict())

            # Define Training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            decayed_lr = tf.train.exponential_decay(FLAGS.lr, global_step, 1000, FLAGS.lr_decay, staircase=True)
            optimizer = tf.train.AdamOptimizer(decayed_lr)
            grads_and_vars = optimizer.compute_gradients(cnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

            # Keep track of gradient values and sparsity (optional)
            grad_summaries = []
            for g, v in grads_and_vars:
                if g is not None:
                    grad_hist_summary = tf.summary.histogram("{}/grad/hist".format(v.name), g)
                    sparsity_summary = tf.summary.scalar("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
                    grad_summaries.append(grad_hist_summary)
                    grad_summaries.append(sparsity_summary)
            grad_summaries_merged = tf.summary.merge(grad_summaries)

            # Output directory for models and summaries
            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
            print("Writing to {}\n".format(out_dir))

            # Summaries for loss and accuracy
            loss_summary = tf.summary.scalar("loss", cnn.loss)
            acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)

            # Train Summaries
            train_summary_op = tf.summary.merge([loss_summary, acc_summary, grad_summaries_merged])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)

            # Dev summaries
            dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
            dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
            dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph)

            # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
            checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.num_checkpoints)

            # Write vocabulary
            with smart_open.smart_open(os.path.join(out_dir, "vocab"), 'wb') as f:
                pickle.dump(word_id_dict, f)
            with smart_open.smart_open(os.path.join(out_dir, "config"), 'wb') as f:
                pickle.dump(FLAGS.flag_values_dict(), f)

            # Initialize all variables
            sess.run(tf.global_variables_initializer())

            if FLAGS.word2vec:
                print("Loading W2V data...")
                pre_emb = KeyedVectors.load_word2vec_format(FLAGS.word2vec, binary=True)
                pre_emb.init_sims(replace=True)
                num_keys = len(pre_emb.vocab)
                print("loaded word2vec len ", num_keys)

                # initial matrix with random uniform
                initW = np.random.uniform(-0.25, 0.25, (FLAGS.vocab_size, FLAGS.embedding_dim))
                # load any vectors from the word2vec
                print("init initW cnn.W in FLAG")
                for w in word_id_dict.keys():
                    arr = []
                    s = re.sub('[^0-9a-zA-Z]+', '', w)
                    if w in pre_emb:
                        arr = pre_emb[w]
                    elif w.lower() in pre_emb:
                        arr = pre_emb[w.lower()]
                    elif s in pre_emb:
                        arr = pre_emb[s]
                    elif s.isdigit():
                        arr = pre_emb['1']
                    if len(arr) > 0:
                        idx = word_id_dict[w]
                        initW[idx] = np.asarray(arr).astype(np.float32)
                print("assigning initW to cnn. len=" + str(len(initW)))
                sess.run(cnn.W.assign(initW))

            def train_step(x_batch, y_batch):
                """
                A single training step
                """
                feed_dict = {
                  cnn.input_x: x_batch,
                  cnn.input_y: y_batch,
                  cnn.dropout_keep_prob: FLAGS.dropout_keep_prob
                }
                _, step, lr, summaries, loss, accuracy = sess.run(
                    [train_op, global_step, decayed_lr, train_summary_op, cnn.loss, cnn.accuracy],
                    feed_dict)
                time_str = datetime.datetime.now().isoformat()
                print("{}: step {}, loss {:g}, lr{:g}, acc {:g}".format(time_str, step, loss, lr, accuracy))
                train_summary_writer.add_summary(summaries, step)

            def dev_step(x_batch, y_batch, writer=None):
                """
                Evaluates model on a dev set
                """
                feed_dict = {
                  cnn.input_x: x_batch,
                  cnn.input_y: y_batch,
                  cnn.dropout_keep_prob: 1.0
                }
                step, summaries, loss, accuracy = sess.run(
                    [global_step, dev_summary_op, cnn.loss, cnn.accuracy],
                    feed_dict)
                time_str = datetime.datetime.now().isoformat()
                print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
                if writer:
                    writer.add_summary(summaries, step)
                return accuracy

            # Generate batches
            batches = dh.batch_iter(
                list(zip(x_train, y_train)), FLAGS.batch_size, FLAGS.num_epochs)
            # Training loop. For each batch...
            max = 0
            for batch in batches:
                x_batch, y_batch = zip(*batch)
                train_step(x_batch, y_batch)
                current_step = tf.train.global_step(sess, global_step)
                if current_step % FLAGS.evaluate_every == 0:
                    print("\nEvaluation:")
                    accuracy = dev_step(x_dev, y_dev, writer=dev_summary_writer)
                    print("")
                    if accuracy > max:
                        max = accuracy
                        path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                        print("Saved model checkpoint to {}\n".format(path))
Code example #35
 def read_from_s3_bucket_by_url(self, bucket, file, aws_access_key_id,
                                aws_secret_access_key):
     s3url = 's3://' + aws_access_key_id + ':' + aws_secret_access_key + '@' + bucket + '/' + file
     for line in smart_open.smart_open(s3url):
         print(line)
Code example #36
def doc2vec(dataset_path, dataset_name, write_path, txt_path_list):

    # Convert text to lower-case and strip punctuation/symbols from words
    def normalize_text(text):
        norm_text = text.lower()
        # Replace breaks with spaces
        norm_text = norm_text.replace('<br />', ' ')
        # Pad punctuation with spaces on both sides
        norm_text = re.sub(r"([\.\",\(\)!\?;:])", " \\1 ", norm_text)
        return norm_text

    alldata_path = os.path.join(write_path,
                                'alldata-id_' + dataset_name + '.txt')

    if not os.path.isfile(alldata_path):

        # Collect & normalize test/train data
        print("Cleaning up dataset...")

        # list of the absolute paths of every text file
        print(" %i files" % (len(txt_path_list)))

        # for each file "txt"
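        # NOTE: this snippet assumes `all_lines` (a list collecting the cleaned documents)
        # and `control_chars` (characters to strip) are defined at module level;
        # they are not shown here.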
        for i, txt in tqdm(enumerate(txt_path_list)):
            with smart_open(txt, "rb") as t:

                try:
                    # "one_text" is the whole document
                    one_text = t.read().decode("utf-8")
                    for c in control_chars:
                        one_text = one_text.replace(c, ' ')
                    one_text = normalize_text(one_text)
                    all_lines.append(one_text)
                except UnicodeDecodeError:

                    # we skip this file, but we need to preserve index pos
                    all_lines.append(" ")
                    continue

        # Save to disk for instant re-use on any future runs
        with smart_open(alldata_path, 'wb') as f:
            for idx, line in enumerate(all_lines):
                num_line = u"_*{0} {1}\n".format(idx, line)
                f.write(num_line.encode("utf-8"))

    assert os.path.isfile(alldata_path), "alldata unavailable"
    print("Success, alldata is available for next steps.")

    #===================================================================
    #=#BLOCK#=#: Read in alldata
    #===================================================================

    # this data object class suffices as a `TaggedDocument`
    # (with `words` and `tags`)
    # plus adds other state helpful for our later evaluation/reporting

    with smart_open(alldata_path, 'rb', encoding='utf-8') as alldata:
        alldata_list = list(alldata)
        print("Iterating up to: ", len(alldata_list))
    with smart_open(alldata_path, 'rb', encoding='utf-8') as alldata:
        documents = [
            TaggedDocument(doc, [i]) for i, doc in tqdm(enumerate(alldata))
        ]
        model = Doc2Vec(documents,
                        vector_size=5,
                        window=2,
                        min_count=1,
                        workers=4)

        fname = get_tmpfile(
            os.path.join(write_path, "doc2vec_model_" + dataset_name))
        model.save(fname)
        model = Doc2Vec.load(
            fname)  # you can continue training with the loaded model!

    return
Code example #37
 def test_shortcut(self):
     fpath = os.path.join(CURR_DIR, 'test_data/crime-and-punishment.txt')
     with mock.patch('smart_open.smart_open_lib.open') as mock_open:
         smart_open.smart_open(fpath, 'r').read()
     mock_open.assert_called_with(fpath, 'r', buffering=-1)
Code example #38
 def test_s3_upload_is_none(self, mock_session):
     smart_open.smart_open("s3://bucket/key", 'wb', s3_upload=None)
     s3_resource = mock_session.return_value.resource.return_value
     s3_object = s3_resource.Object.return_value
     s3_object.initiate_multipart_upload.assert_called()
Code example #39
 def test_file(self, mock_smart_open):
     """Is file:// line iterator called correctly?"""
     smart_open_object = smart_open.smart_open("file:///tmp/test.txt", "rb")
     smart_open_object.__iter__()
     # called with the correct path?
     mock_smart_open.assert_called_with("/tmp/test.txt", "rb")
Code example #40
def nlp_preprocess(filepath_dict: dict,
                   col: str,
                   df=None,
                   verbose: bool = True,
                   overwrite_interim: bool = True) -> pd.DataFrame:
    def clean_doc(corpus):
        '''
        generator function to read in docs from the file,
        and substitute and remove substrings
        '''
        for doc in corpus:
            yield au_tu.remove_substrings(au_tu.clean_tokens(
                doc,
                tokens=to_replace_dict,
                whole_words_only=whole_words_only,
                ignore_case=ignore_case,
            ),
                                          to_remove_list=to_remove_list,
                                          whole_words_only=whole_words_only,
                                          ignore_case=ignore_case)

    def tokenize_entities(parsed_doc):
        txt = parsed_doc.text
        for ent in parsed_doc.ents:
            txt = txt[:ent.start_char] + ent.text.replace(
                ' ', '_') + txt[ent.end_char:]
        return txt

    def cleaned_doc_corpus(corpus):
        '''
        generator function to use spaCy to parse docs, clean docs,
        tokenize named entities, and yield documents
        '''
        for parsed_doc in nlp.pipe(clean_doc(corpus),
                                   batch_size=nlp_batch_size,
                                   n_threads=nlp_n_threads):
            yield tokenize_entities(parsed_doc)

    def punct_space_more(token):
        '''
        helper function to eliminate tokens that are
        pure punctuation or whitespace or digits or only 1 character
        '''
        return (
            token.is_punct or token.is_space or token.is_digit
            or token.text == "'s" or token.lemma_ == '-PRON-' or
            # token.lemma_ == 'say' or
            # token.lemma_ == 'tell' or
            # token.lemma_ == 'be' or
            len(token.text) <= 1)

    def line_doc(filename):
        '''
        generator function to read in docs from the file,
        un-escape the original line breaks in the text,
        and do additional cleaning
        '''
        def hyp_to_us(doc):
            return re.sub(r'\b-\b', '_', doc)

        def remove_punct(doc):
            # keep: alphanumeric (w), spaces (s), single quote, underscore
            return re.sub(r'[^\w\s\'_]+', '', doc)

        # with codecs.open(filename, encoding='utf_8') as f:
        with smart_open(filename) as f:
            for doc in f:
                yield remove_punct(hyp_to_us(doc.decode())).replace(
                    '\\n', '\n')

    def lemmatized_sentence_corpus(filename):
        '''
        generator function to use spaCy to parse docs,
        lemmatize the text, and yield sentences
        '''
        for parsed_doc in nlp.pipe(line_doc(filename),
                                   batch_size=nlp_batch_size,
                                   n_threads=nlp_n_threads):

            for sent in parsed_doc.sents:
                yield ' '.join([
                    token.lemma_ for token in sent
                    if not punct_space_more(token)
                ])

    if verbose:
        logger.info(f'Working on text from: {col}')

    # # debug - only getting from the sample dataframe here
    # df_phrased = df.loc[df[col].notnull(), ['tfa_master_uid', 'app_year', col]].sample(n=50).copy()

    df_phrased = df.loc[df[col].notnull(),
                        ['tfa_master_uid', 'app_year', col]].copy()

    nlp = spacy.load('en', disable=[])

    # clean text and tokenize entities
    if verbose:
        logger.info('Cleaning docs...')
    df_phrased[col] = list(cleaned_doc_corpus(df_phrased[col].values))
    # remove 'the_' from NER tokens
    df_phrased[col] = df_phrased[col].apply(
        lambda x: ' '.join([re.sub('^the_', 'the ', y) for y in x.split()]))
    if verbose:
        logger.info('\tDone.')

    # create & open a new file in write mode
    if verbose:
        logger.info('Saving documents, one per line...')
    doc_count = 0
    with codecs.open(filepath_dict['doc_txt_filepath'], 'w',
                     encoding='utf_8') as doc_txt_file:
        for doc in df_phrased[[col]].apply(lambda x: ' '.join(x),
                                           axis=1).tolist():
            # write the doc as a line in the new file
            # escape newline characters in the original doc text
            doc_txt_file.write(doc.replace('\n', '\\n') + '\n')
            doc_count += 1
    if verbose:
        logger.info(
            f"Text from {doc_count:,} docs written to: {filepath_dict['doc_txt_filepath']}"
        )

    nlp = spacy.load('en', disable=['ner'])

    # lemmatize and save sentences

    if overwrite_interim:
        if verbose:
            logger.info(
                f"Processing documents into unigram sentences: {filepath_dict['unigram_sentences_filepath']}"
            )
        # with codecs.open(filepath_dict['unigram_sentences_filepath'], 'w', encoding='utf_8') as f:
        with smart_open(filepath_dict['unigram_sentences_filepath'], 'w') as f:
            for sentence in lemmatized_sentence_corpus(
                    filepath_dict['doc_txt_filepath']):
                f.write(sentence + '\n')
            if verbose:
                logger.info('Done.')
        unigram_sentences = LineSentence(
            filepath_dict['unigram_sentences_filepath'])

        if verbose:
            logger.info('Unigram examples:')
            for unigram_sentence in it.islice(unigram_sentences, 10, 20):
                logger.info(u' '.join(unigram_sentence))
                logger.info('=' * 30)

        if verbose:
            logger.info('Finding bigram phrases')
        # create the bigram model
        bigram = Phrases(unigram_sentences,
                         min_count=phrase_min_count,
                         threshold=phrase_threshold,
                         max_vocab_size=phrase_max_vocab_size,
                         progress_per=phrase_progress_per,
                         scoring=phrase_scoring,
                         common_terms=phrase_common_terms)
        bigram_model = Phraser(bigram)
        bigram_model.save(filepath_dict['bigram_model_filepath'])

        if verbose:
            logger.info(
                f"Saving bigram phrased sentences: {filepath_dict['bigram_sentences_filepath']}"
            )
        # save bigram sentences
        with codecs.open(filepath_dict['bigram_sentences_filepath'],
                         'w',
                         encoding='utf_8') as f:
            for unigram_sentence in unigram_sentences:
                bigram_sentence = u' '.join(bigram_model[unigram_sentence])
                f.write(bigram_sentence + '\n')

        bigram_sentences = LineSentence(
            filepath_dict['bigram_sentences_filepath'])
        if verbose:
            logger.info('Bigram examples:')
            for bigram_sentence in it.islice(bigram_sentences, 10, 20):
                logger.info(u' '.join(bigram_sentence))
                logger.info('=' * 30)

        if verbose:
            logger.info('Finding trigram phrases')
        # create the trigram model
        trigram = Phrases(bigram_sentences,
                          min_count=phrase_min_count,
                          threshold=phrase_threshold,
                          max_vocab_size=phrase_max_vocab_size,
                          progress_per=phrase_progress_per,
                          scoring=phrase_scoring,
                          common_terms=phrase_common_terms)
        trigram_model = Phraser(trigram)
        trigram_model.save(filepath_dict['trigram_model_filepath'])

        if verbose:
            logger.info(
                f"Saving trigram phrased sentences: {filepath_dict['trigram_sentences_filepath']}"
            )
        # create trigram sentences
        with codecs.open(filepath_dict['trigram_sentences_filepath'],
                         'w',
                         encoding='utf_8') as f:
            for bigram_sentence in bigram_sentences:
                trigram_sentence = u' '.join(trigram_model[bigram_sentence])
                f.write(trigram_sentence + '\n')

        trigram_sentences = LineSentence(
            filepath_dict['trigram_sentences_filepath'])
        if verbose:
            logger.info('Trigram examples:')
            for trigram_sentence in it.islice(trigram_sentences, 10, 20):
                logger.info(u' '.join(trigram_sentence))
                logger.info('=' * 30)

    if verbose:
        logger.info(
            f"Saving phrased docs using saved models: {filepath_dict['trigram_docs_filepath']}"
        )
    # using saved models, write transformed text out to a new file, one doc per line
    with codecs.open(filepath_dict['trigram_docs_filepath'],
                     'w',
                     encoding='utf_8') as f:
        for parsed_doc in nlp.pipe(line_doc(filepath_dict['doc_txt_filepath']),
                                   batch_size=nlp_batch_size,
                                   n_threads=nlp_n_threads):

            # removing punctuation and whitespace
            unigram_doc = [
                token.lemma_ for token in parsed_doc
                if not punct_space_more(token)
            ]

            # apply the first-order and second-order phrase models
            bigram_doc = bigram_model[unigram_doc]
            trigram_doc = trigram_model[bigram_doc]

            # remove any remaining stopwords
            trigram_doc = [
                term for term in trigram_doc
                if term not in nlp.Defaults.stop_words
            ]

            # extend the stop words
            stop_words_extend = [
                'from', 'subject', 're', 'edu', 'use', 'not', 'would', 'say',
                'could', '_', 'be', 'know', 'good', 'go', 'get', 'do', 'done',
                'try', 'many', 'some', 'nice', 'thank', 'think', 'see',
                'rather', 'easy', 'easily', 'lot', 'lack', 'make', 'want',
                'seem', 'run', 'need', 'even', 'right', 'line', 'even', 'also',
                'may', 'take', 'come'
            ]
            trigram_doc = [
                term for term in trigram_doc if term not in stop_words_extend
            ]

            # write the transformed doc as a line in the new file
            trigram_doc = ' '.join(trigram_doc)
            f.write(trigram_doc + '\n')
    if verbose:
        logger.info('Done.')

    # put the text back in the dataframe
    trigram_docs = LineSentence(filepath_dict['trigram_docs_filepath'])

    if len([doc for doc in trigram_docs]) == df_phrased.shape[0]:
        for i, doc in enumerate(trigram_docs):
            df_phrased.iloc[i, df_phrased.columns.get_loc(col)] = ' '.join(doc)
    else:
        raise ValueError(
            'Different number of processed and original documents')

    # save dataframe
    if verbose:
        logger.info('Saving NLP processed data: {}'.format(
            filepath_dict['filepath_out']))
    df_phrased.to_csv(filepath_dict['filepath_out'])

    return df_phrased
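A hedged usage sketch for nlp_preprocess: the dictionary keys are the ones the function reads and writes (inferred from the body above); the paths, the DataFrame and the column name are placeholders, and the function also expects several module-level settings (to_replace_dict, phrase_min_count, nlp_batch_size, ...) not shown in the snippet.

filepath_dict = {
    'doc_txt_filepath': 'interim/docs.txt',
    'unigram_sentences_filepath': 'interim/unigram_sentences.txt',
    'bigram_model_filepath': 'models/bigram.model',
    'bigram_sentences_filepath': 'interim/bigram_sentences.txt',
    'trigram_model_filepath': 'models/trigram.model',
    'trigram_sentences_filepath': 'interim/trigram_sentences.txt',
    'trigram_docs_filepath': 'interim/trigram_docs.txt',
    'filepath_out': 'processed/nlp_processed.csv',
}
df_processed = nlp_preprocess(filepath_dict, col='application_text', df=df, verbose=True)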
Code example #41
    def train(cls,
              wr_path,
              corpus_file,
              out_name,
              size=100,
              window=15,
              symmetric=1,
              min_count=5,
              max_vocab_size=0,
              sgd_num=100,
              lrate=0.001,
              period=10,
              iter=90,
              epsilon=0.75,
              dump_period=10,
              reg=0,
              alpha=100,
              beta=99,
              loss='hinge',
              memory=4.0,
              np=1,
              cleanup_files=False,
              sorted_vocab=1,
              ensemble=0):
        """
        The word and context embedding files are generated by the wordrank binary and saved in the "out_name" directory,
        which is created inside the wordrank directory. The vocab and cooccurrence files are generated using the glove code
        available inside the wordrank directory. These files are used by the wordrank binary for training.

        `wr_path` is the absolute path to the Wordrank directory.
        `corpus_file` is the filename of the text file to be used for training the Wordrank model.
        Expects file to contain space-separated tokens in a single line
        `out_name` is name of the directory which will be created (in wordrank folder)
        to save embeddings and training data.
        It will contain following contents:

            Word Embeddings saved after every dump_period, stored in a file model_word_<current iter>.txt
            Context Embeddings saved after every dump_period, stored in a file model_context_<current iter>.txt
            A meta directory which contains: 'vocab.txt' - vocab words,
            'wiki.toy' - word-word cooccurrence values, 'meta' - vocab and cooccurrence lengths

        `size` is the dimensionality of the feature vectors.
        `window` is the number of context words to the left (and to the right, if symmetric = 1).
        `symmetric` if 0, only use left context words, else use left and right both.
        `min_count` = ignore all words with total frequency lower than this.
        `max_vocab_size` upper bound on vocabulary size, i.e. keep the <int> most frequent words.
        Default is 0 for no limit.
        `sgd_num` number of SGD taken for each data point.
        `lrate` is the learning rate (too high diverges, give Nan).
        `period` is the period of xi variable updates
        `iter` = number of iterations (epochs) over the corpus.
        `epsilon` is the power scaling value for weighting function.
        `dump_period` is the period after which embeddings should be dumped.
        `reg` is the value of regularization parameter.
        `alpha` is the alpha parameter of gamma distribution.
        `beta` is the beta parameter of gamma distribution.
        `loss` = name of the loss (logistic, hinge).
        `memory` = soft limit for memory consumption, in GB.
        `np` number of copies to execute. (mpirun option)
        `cleanup_files` if True, delete directory and files used by this wrapper,
        setting to False can be useful for debugging
        `sorted_vocab` = if 1 (default), sort the vocabulary by descending frequency before assigning word indexes.
        `ensemble` = 0 (default), use ensemble of word and context vectors
        """

        # prepare training data (cooccurrence matrix and vocab)
        model_dir = os.path.join(wr_path, out_name)
        meta_dir = os.path.join(model_dir, 'meta')
        os.makedirs(meta_dir)
        logger.info("Dumped data will be stored in '%s'", model_dir)
        copyfile(corpus_file, os.path.join(meta_dir,
                                           corpus_file.split('/')[-1]))

        vocab_file = os.path.join(meta_dir, 'vocab.txt')
        temp_vocab_file = os.path.join(meta_dir, 'tempvocab.txt')
        cooccurrence_file = os.path.join(meta_dir, 'cooccurrence')
        cooccurrence_shuf_file = os.path.join(meta_dir, 'wiki.toy')
        meta_file = os.path.join(meta_dir, 'meta')

        cmd_vocab_count = [
            os.path.join(wr_path, 'glove', 'vocab_count'), '-min-count',
            str(min_count), '-max-vocab',
            str(max_vocab_size)
        ]
        cmd_cooccurence_count = [
            os.path.join(wr_path, 'glove', 'cooccur'), '-memory',
            str(memory), '-vocab-file', temp_vocab_file, '-window-size',
            str(window), '-symmetric',
            str(symmetric)
        ]
        cmd_shuffle_cooccurences = [
            os.path.join(wr_path, 'glove', 'shuffle'), '-memory',
            str(memory)
        ]
        cmd_del_vocab_freq = ['cut', '-d', " ", '-f', '1', temp_vocab_file]

        commands = [
            cmd_vocab_count, cmd_cooccurence_count, cmd_shuffle_cooccurences
        ]
        input_fnames = [
            os.path.join(meta_dir,
                         os.path.split(corpus_file)[-1]),
            os.path.join(meta_dir,
                         os.path.split(corpus_file)[-1]), cooccurrence_file
        ]
        output_fnames = [
            temp_vocab_file, cooccurrence_file, cooccurrence_shuf_file
        ]

        logger.info("Prepare training data (%s) using glove code",
                    ", ".join(input_fnames))
        for command, input_fname, output_fname in zip(commands, input_fnames,
                                                      output_fnames):
            with smart_open(input_fname, 'rb') as r:
                with smart_open(output_fname, 'wb') as w:
                    utils.check_output(w, args=command, stdin=r)

        logger.info("Deleting frequencies from vocab file")
        with smart_open(vocab_file, 'wb') as w:
            utils.check_output(w, args=cmd_del_vocab_freq)

        with smart_open(vocab_file, 'rb') as f:
            numwords = sum(1 for _ in f)
        with smart_open(cooccurrence_shuf_file, 'rb') as f:
            numlines = sum(1 for _ in f)
        with smart_open(meta_file, 'wb') as f:
            meta_info = "{0} {1}\n{2} {3}\n{4} {5}".format(
                numwords, numwords, numlines,
                cooccurrence_shuf_file.split('/')[-1], numwords,
                vocab_file.split('/')[-1])
            f.write(meta_info.encode('utf-8'))

        if iter % dump_period == 0:
            iter += 1
        else:
            logger.warning(
                "Resultant embedding will be from %d iterations rather than the input %d iterations, "
                "as wordrank dumps the embedding only at dump_period intervals. "
                "Input an appropriate combination of parameters (iter, dump_period) "
                "such that \"iter mod dump_period\" is zero.",
                iter - (iter % dump_period), iter)

        wr_args = {
            'path': meta_dir,
            'nthread': multiprocessing.cpu_count(),
            'sgd_num': sgd_num,
            'lrate': lrate,
            'period': period,
            'iter': iter,
            'epsilon': epsilon,
            'dump_prefix': 'model',
            'dump_period': dump_period,
            'dim': size,
            'reg': reg,
            'alpha': alpha,
            'beta': beta,
            'loss': loss
        }

        # run wordrank executable with wr_args
        cmd = ['mpirun', '-np', str(np), os.path.join(wr_path, 'wordrank')]
        for option, value in wr_args.items():
            cmd.append('--%s' % option)
            cmd.append(str(value))
        logger.info("Running wordrank binary")
        utils.check_output(args=cmd)

        # use embeddings from max. iteration's dump
        max_iter_dump = iter - (iter % dump_period)
        os.rename('model_word_%d.txt' % max_iter_dump,
                  os.path.join(model_dir, 'wordrank.words'))
        os.rename('model_context_%d.txt' % max_iter_dump,
                  os.path.join(model_dir, 'wordrank.contexts'))
        model = cls.load_wordrank_model(
            os.path.join(model_dir, 'wordrank.words'), vocab_file,
            os.path.join(model_dir, 'wordrank.contexts'), sorted_vocab,
            ensemble)

        if cleanup_files:
            rmtree(model_dir)
        return model
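A hedged usage sketch for the trainer above, assuming the wrapper lives at gensim.models.wrappers.Wordrank as in older gensim releases; paths and hyperparameters are placeholders.

from gensim.models.wrappers import Wordrank

model = Wordrank.train('/path/to/wordrank', 'corpus.txt', 'wr_out',
                       size=100, window=15, iter=90, dump_period=10)
print(model.most_similar('king', topn=5))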
Code example #42
File: __init__.py Project: tiehan/Rhea_Chip
 def save(self, d, pickle_protocol=2):
     handle = smart_open(self.fname, 'w')
     cPickle.dump(d, handle, protocol=pickle_protocol)
     handle.close()
Code example #43
File: __init__.py Project: tiehan/Rhea_Chip
 def load(self):
     with smart_open(self.fname) as f:
         return cPickle.loads(f.read())
Code example #44
def writer_count(select_count):
    with smart_open(count_sentences_path, 'wb', encoding='utf-8') as fout:
        for name, count in select_count.items():
            fout.write(name + " " + str(count) + "\n")
    return "重新添加完成"  # "re-adding complete"
Code example #45
File: filetokenizer.py Project: warrior1127/CS484
def serializeObject(fileName, obj):
    with smart_open.smart_open(fileName, "wb") as f:
        pickle.dump(obj, f)
Code example #46
    def test_file_mode_mock(self, mock_file, mock_boto):
        """Are file:// open modes passed correctly?"""
        # incorrect file mode
        self.assertRaises(NotImplementedError, smart_open.smart_open, "s3://bucket/key", "x")

        # correct read modes
        smart_open.smart_open("blah", "r")
        mock_file.assert_called_with("blah", "r")

        smart_open.smart_open("blah", "rb")
        mock_file.assert_called_with("blah", "rb")

        short_path = "~/blah"
        full_path = os.path.expanduser(short_path)
        smart_open.smart_open(short_path, "rb")
        mock_file.assert_called_with(full_path, "rb")

        # correct write modes, incorrect scheme
        self.assertRaises(NotImplementedError, smart_open.smart_open, "hdfs:///blah.txt", "wb+")
        self.assertRaises(NotImplementedError, smart_open.smart_open, "http:///blah.txt", "w")
        self.assertRaises(NotImplementedError, smart_open.smart_open, "s3://bucket/key", "wb+")

        # correct write mode, correct file:// URI
        smart_open.smart_open("blah", "w")
        mock_file.assert_called_with("blah", "w")

        smart_open.smart_open("file:///some/file.txt", "wb")
        mock_file.assert_called_with("/some/file.txt", "wb")

        smart_open.smart_open("file:///some/file.txt", "wb+")
        mock_file.assert_called_with("/some/file.txt", "wb+")

        smart_open.smart_open("file:///some/file.txt", "w+")
        mock_file.assert_called_with("/some/file.txt", "w+")
Code example #47
File: filetokenizer.py Project: warrior1127/CS484
def deserializeObject(fileName):
    with smart_open.smart_open(fileName, "rb") as f:
        return pickle.load(f)
Code example #48
# Business IDs of the restaurants.
ids = [
    'EAwh1OmG6t6p3nRaZOW_AA', 'pomGBqfbxcqPv14c3XH-ZQ',
    'iCQpiavjjPzJ5_3gPD5Ebg', 'UBv8heCQR0RPnUQG0zkXIQ',
    '7m1Oa1VYV98UUuo_6i0EZg'
]

w2v_corpus = []  # Documents to train word2vec on (all 6 restaurants).
wmd_corpus = []  # Documents to run queries against (only one restaurant).
documents = [
]  # wmd_corpus, with no pre-processing (so we can see the original documents).

biz = {}

with smart_open('yelp_dataset/yelp_academic_dataset_business.json',
                'rb') as business_data_file:
    for line in business_data_file:
        json_line = json.loads(line)
        biz[json_line['business_id']] = json_line['name']

review_data_dict = {}
votes = {}

with smart_open('yelp_dataset/yelp_academic_dataset_review.json',
                'rb') as data_file:
    for line in data_file:
        json_line = json.loads(line)

        if json_line['business_id'] not in ids:
            # Not one of the 6 restaurants.
            continue
Code example #49
        help="Input file, in gloVe format (read-only).")
    parser.add_argument(
        "-o", "--output", required=True,
        help="Output file, in word2vec text format (will be overwritten).")
    args = parser.parse_args()

    # do the actual conversion
    num_lines, num_dims = glove2word2vec(args.input, args.output)
    logger.info('Converted model with %i vectors and %i dimensions', num_lines, num_dims)
    # test that the converted model loads successfully
    model = gensim.models.KeyedVectors.load_word2vec_format(args.output, binary=False)
    logger.info('Model %s successfully loaded', model)
    try:
        logger.info('testing the model....')
        if sys.version_info < (3,):
            with smart_open(args.output, 'rb') as f:
                seed_word1, seed_word2 = random.sample([line.split()[0] for line in f], 2)
        else:
            with smart_open(args.output, 'r') as f:
                seed_word1, seed_word2 = random.sample([line.split()[0] for line in f], 2)
        logger.info('top-10 most similar words to "%s" are: %s', seed_word1, model.most_similar(positive=[seed_word1], topn=10))
        logger.info('similarity score between %s and %s: %s', seed_word1, seed_word2, model.similarity(seed_word1, seed_word2))
    except Exception:
        logger.error('error encountered. checking for model file creation now....')
        if os.path.isfile(args.output):
            logger.info('model file %s was created but could not be loaded.', args.output)
        else:
            logger.info('model file %s creation failed.', args.output)
        logger.info('please check the parameters and input file format.')
        raise
Code example #50
0
    tableSplitList = table.split('<td style="border:1px solid saddlebrown;"')
    if len(tableSplitList) < 2:
        print 'table split went poorly %s' % url
        return url
    pdList, column_list = table_splitter(tableSplitList)

    column_list += ['NDIC Index']
    df = pd.DataFrame(pdList, columns=column_list)
    # return the column list too, to match the two-value unpack in __main__ below
    return df, column_list


if __name__ == '__main__':
    password_mgr = urllib2.HTTPPasswordMgrWithDefaultRealm()
    top_level_url = prefix + dfScoutFilt.iloc[0].prodLink
    opener = secure_open(password_mgr, top_level_url)
    all_links = get_ndic_with_production_data()
    pandas_list = []  ## append dataframes to a list, then concat list for big prod table
    for production_link in all_links.keys():
        try:
            datalines = get_prod_url_data(HTTP_PREFIX + production_link, opener)
            temp_df, column_names = split_html_table(datalines)
            pandas_list.append(temp_df)
        except:
            raise
    dfConcat = pd.concat(pandas_list)
    with smart_open.smart_open(S3_DIR + 'production_data.p', 'wb') as fout:
        pickle.dump(dfConcat, fout)



Code example #51
0
def cleanLinksFromFile(inFile):
    inFile = normpath(inFile)
    # create a pool of N workers
    # N = cpu_count() - 1 is another option; measured throughput (pages/sec):
    #   reading N lines per dispatch:   N=1:188  N=2:237  N=3:273  N=4:304
    #                                   N=5:310  N=6:320  N=7:330  N=8:323
    #   reading 2*N lines per dispatch: N=3:330  N=4:350  N=5:333
    #                                   N=6:355  N=7:356  N=8:340
    N = 6  # best observed throughput (~355 pages/sec)

    if debug:
        N = 1
    pool = Pool(N)
    heartBeat = 5000 * N     # report progress every heartBeat lines
    exportChunk = 100000     # flush accumulated output every exportChunk lines
    if debug:
        exportChunk = 5      # lines
    finalOutput1 = []
    finalOutput2 = []
    finalOutput3 = []
    keepGoing = True
    currentLine = 0
    writeFileFirstTime(inFile)
    start = time.time()
    with smart_open.smart_open(inFile) as fp:
        while keepGoing:
            data = []
            for i in range(0, 2 * N):
                try:  # assume there is more data
                    data.append(fp.next())  # build up a chunk of data
                    currentLine += 1
                except StopIteration:
                    print 'reached end of file'
                    keepGoing = False
                    break

            output = pool.map(cleanParentsAndChildren, data)
            #output = map(parseParentsAndChildren, data)


            #TO DO: IF OUTPUT IS LARGE - WRITE IT TO A FILE - CREATE APPEND
            for line in output:
                finalOutput1.append(line[0])
                finalOutput2.append(line[1])
                finalOutput3.append(line[2])

            if (len(finalOutput1)>=exportChunk):
                writeAllFiles(inFile,finalOutput1,finalOutput2,finalOutput3)
                finalOutput1 = []
                finalOutput2 = []
                finalOutput3 = []

            if currentLine % heartBeat == 0:
                eTime = time.time()-start
                start = time.time()
                print 'currentLine: ', currentLine, 'est. l/sec: ', heartBeat/float(eTime)
                sys.stdout.flush()

#            if (currentLine>206350):
#                print currentLine
    #EXPORT THE DATA AND ZERO OUT THE LISTS
    writeAllFiles(inFile, finalOutput1, finalOutput2, finalOutput3)
    finalOutput1 = []
    finalOutput2 = []
    finalOutput3 = []


    pool.close()
    pool.join()


    return
Code example #52
0
 def test_no_kwargs(self, mock_session):
     smart_open.smart_open('s3://mybucket/mykey')
     mock_session.assert_called_with(profile_name=None)
     mock_session.return_value.resource.assert_called_with('s3')
Code example #53
0
 def test_file_buffering2(self, mock_smart_open):
     smart_open_object = smart_open.smart_open('/tmp/somefile', 'rb', 0)
     smart_open_object.__iter__()
     # called with the correct expanded path?
     mock_smart_open.assert_called_with('/tmp/somefile', 'rb', buffering=0)
Code example #54
0
 def test_binary(self):
     with mock.patch(_BUILTIN_OPEN,
                     mock.Mock(return_value=self.bytesio)) as mock_open:
         with smart_open.smart_open("blah", "rb") as fin:
             self.assertEqual(fin.read(), self.as_bytes)
             mock_open.assert_called_with("blah", "rb", buffering=-1)
Code example #55
0
 def test_profile(self, mock_session):
     smart_open.smart_open('s3://mybucket/mykey',
                           profile_name='my_credentials')
     mock_session.assert_called_with(profile_name='my_credentials')
     mock_session.return_value.resource.assert_called_with('s3')
Code example #56
#!/usr/bin/python3.6

#########################
# python3 file kafka_producer.py
# read data from S3 and send it to the Kafka cluster
#########################

from kafka import KafkaProducer
from smart_open import smart_open
import yaml
from random import randint
import sys
import time

# read config file
with open("config.yml", 'r') as ymlfile:
    config = yaml.safe_load(ymlfile)  # safe_load avoids the unsafe full loader

fpath = config['s3_fpath_user'] + '_' + str(sys.argv[1]) + '.csv'
producer = KafkaProducer(bootstrap_servers=config['bootstrap_servers_address'])

dt = 4 / float(sys.argv[2]) - 1 / 600
for user in smart_open(fpath, 'rb'):
    # send the sessions
    producer.send('requests', user[:-1])
    producer.flush()
    time.sleep(dt)
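A hypothetical companion snippet, not part of the original script, for checking what the producer above publishes to the 'requests' topic. The broker address is an assumption; in practice it would come from the same config.yml.

from kafka import KafkaConsumer

consumer = KafkaConsumer(
    'requests',
    bootstrap_servers='localhost:9092',  # assumed local broker
    auto_offset_reset='earliest',
    consumer_timeout_ms=10000,           # stop iterating after 10 s of silence
)
for message in consumer:
    print(message.value.decode('utf-8', errors='replace'))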
Code example #57
    def predicting(self):
        import boto3
        from metaflow import S3
        import re
        import pandas as pd
        from nltk import tokenize
        import string
        import nltk
        from nltk.corpus import stopwords
        from smart_open import smart_open
        import requests

        nltk.download('punkt')
        nltk.download('stopwords')
        nltk.download('words')

        listed = []
        with smart_open('s3://inputbucket1221/testdata.txt', 'r') as s3_source:
            Line = s3_source.readline()

            while Line != '':
                Line1 = Line.split(".")
                for Sentence in Line1:
                    listed.append(Sentence)
                Line = s3_source.readline()

        L = []
        for x in listed:
            if len(x) > 5:
                L.append(x)

        df = pd.DataFrame()

        df['Text'] = L
        print(df['Text'])

        def remove_punct(text):
            text = "".join(
                [char for char in text if char not in string.punctuation])
            text = re.sub('[0-9]+', '', text)
            return text

        df['Textclean'] = df['Text'].apply(lambda x: remove_punct(x))
        df = df.dropna()

        url = 'http://localhost:5000/model/predict'
        #myobj = {"The Model Asset Exchange is a crucial element of a developer's toolkit."}
        #data = {'text':['We did it!']}

        #x = requests.post(url, json = data)

        #print(x.text)
        senti_list = []
        for x in df['Textclean']:
            data = {'text': [x]}
            res = (requests.post(url, json=data)).text.split()
            if float(res[4][:-1]) > float(res[6][:-3]):
                temp = (x, "Positive")
                senti_list.append(temp)
            elif float(res[4][:-1]) < float(res[6][:-3]):
                temp = (x, "Negative")
                senti_list.append(temp)
            else:
                temp = (x, "Neutral")
                senti_list.append(temp)
        sentiment = pd.DataFrame(senti_list, columns=['Sentence', 'Sentiment'])

        #sentiment.to_csv('sample.csv',index=False)

        sentiment.to_csv('final_sentiment.csv', index=False)

        with S3(s3root='s3://outputbucket1221/') as s3:
            s3.put_files([('final_sentiment.csv', 'final_sentiment.csv')])

        self.next(self.end)
Code example #58
0
            return []


# In[ ]:

processed = []
chunk_size = 5000

# data = pd.DataFrame()
letters = [
    "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O",
    "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z"
]
for letter in letters:
    csv_name = "s3://millionsongprocessed/data_" + letter + ".tsv"
    with smart_open.smart_open(csv_name, 'w') as fout:
        writer = csv.writer(fout, delimiter='\t', lineterminator='\n')
        print(letter)
        for prefix in get_prefixes(letter):

            row = transform_s3(prefix)  # transform once instead of twice
            if row == []:
                continue
            processed.append(row)

            if len(processed) % chunk_size == 0:
                #                 part_data = rows_to_file(processed)
                print("running")

                #             data = pd.concat([data, part_data], axis=0)
                writer.writerows(processed)
                processed = []
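One caveat with the chunked writes above: whatever is left in processed after the last prefix (fewer than chunk_size rows) is never flushed. A hedged sketch of a trailing flush that could sit at the end of the with-block, after the prefix loop:

        # hypothetical final flush for the remainder (same indentation level
        # as the `for prefix in get_prefixes(letter)` loop)
        if processed:
            writer.writerows(processed)
            processed = []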
Code example #59
0
    def get_records_batch(self, hook, query_filter):
        """Chunk the records and stream them to S3 in batches of self.batchsize."""

        if query_filter == '':
            query_filter = 'WHERE'
        else:
            query_filter = query_filter + ' AND '

        count_sql_max = """
        SELECT max({0}) as c FROM {1} """.format(self.primary_key,
                                                 self.mssql_table)

        count_sql_min = """
        SELECT min({0}) as c FROM {1}  WHERE {0}>0""".format(
            self.primary_key, self.mssql_table)

        count_sql_max_incremental = """
                SELECT max({0}) as c FROM {1} """.format(
            self.incremental_key, self.mssql_table)

        count_sql_min_incremental = """
                        SELECT min({0}) as c FROM {1} """.format(
            self.incremental_key, self.mssql_table)

        # if query_filter != 'WHERE':
        #    # Remove the AND from the query filter so you're only batching
        #    # for incremental loads within your timerange. Assumes primary_key
        # is incremental.
        #   count_sql_max += query_filter.split("AND")[0]
        #  #count_sql_min += query_filter.split("AND")[0]

        count = int(hook.get_pandas_df(count_sql_max)['c'][0])
        min_count = int(hook.get_pandas_df(count_sql_min)['c'][0])

        max_date = (hook.get_pandas_df(count_sql_max_incremental)['c'][0])
        min_date = (hook.get_pandas_df(count_sql_min_incremental)['c'][0])
        print(count_sql_min)
        print(count)
        print(min_count)

        s3_conn = BaseHook('S3').get_connection(self.s3_conn_id)
        s3_creds = s3_conn.extra_dejson

        # Build the destination key. If a suffix is requested, embed the
        # incremental date range in the file name (this assumes the key looks
        # like "name.ext" and that the suffix flag lives on self like the
        # other settings).
        s3_key = self.s3_key
        if self.s3_key_suffix:
            name, ext = s3_key.split(".", 1)
            s3_key = '{}{}-{}.{}'.format(name, min_date, max_date, ext)

        s3_key = '{}/{}'.format(self.s3_bucket, s3_key)

        url = 's3://{}:{}@{}'.format(s3_creds['aws_access_key_id'],
                                     s3_creds['aws_secret_access_key'], s3_key)

        logging.info('Initiating record retrieval in batches.')
        logging.info('Query: {0}'.format(count_sql_min))
        logging.info('Start Date: {0}'.format(self.start))
        logging.info('End Date: {0}'.format(self.end))
        logging.info('smallest_number: {0}'.format(min_count))
        logging.info('count: {0}'.format(count))

        # Smart Open is a library for efficiently streaming large files to S3.
        # Streaming data to S3 here so it doesn't break the task container.
        # https://pypi.python.org/pypi/smart_open
        # The write happens inline here because smart_open doesn't yet support
        # an append mode, and doing it in a separate function caused the file
        # to be overwritten on every call.

        with smart_open.smart_open(url, 'wb') as fout:
            logging.info("First Row: {0}".format(min_count))
            logging.info("Total Rows: {0}".format(count))
            logging.info("Batch Size: {0}".format(self.batchsize))
            # count + 1 so the row with the maximum key is not dropped when
            # (count - min_count) is an exact multiple of the batch size
            for batch in range(min_count, count + 1, self.batchsize):
                query = \
                    """
                    SELECT  *
                    FROM {table}
                    {query_filter} {primary_key} >= {batch}
                    AND {primary_key} < {batch_two};
                    """.format(count=count,
                               table=self.mssql_table,
                               primary_key=self.primary_key,
                               query_filter=query_filter,
                               batch=batch,
                               batch_two=batch + self.batchsize)

                logging.info(query)

                # Perform query and convert returned tuple to list
                results = list(hook.get_records(query))
                logging.info(
                    'Successfully performed query for batch {0}-{1}.'.format(
                        batch, (batch + self.batchsize)))

                results = [
                    dict([k.lower(), str(v)] if v is not None else [k, v]
                         for k, v in i.items()) for i in results
                ]
                # One JSON document per line; the trailing newline keeps
                # consecutive batches from running together on one line.
                results = '\n'.join([json.dumps(i) for i in results]) + '\n'
                # Encode to bytes before writing to the binary S3 stream.
                results = results.encode('utf-8')
                logging.info("Uploading!")
                fout.write(results)
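As a toy illustration of how the batching above partitions the key space (numbers are made up, not from the source): with min_count=1, count=10 and a batch size of 4, the loop produces half-open ranges that cover every key exactly once.

min_count, count, batchsize = 1, 10, 4
batches = [(b, b + batchsize) for b in range(min_count, count + 1, batchsize)]
print(batches)  # [(1, 5), (5, 9), (9, 13)]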
Code example #60
0
    writer = avro.datafile.DataFileWriter(foutd, avro.io.DatumWriter(), schema)
    for ll, row in enumerate(dictRes):
        writer.append(row)
    writer.close()


#
# The above two functions appear to work identically.
#
write_avro = write_avro_context_manager

with open('local.avro', 'wb') as foutd:
    logging.critical('writing to %r', foutd)
    write_avro(foutd)

with smart_open.smart_open('local-so.avro', 'wb') as foutd:
    logging.critical('writing to %r', foutd)
    write_avro(foutd)
subprocess.check_call(['diff', 'local.avro', 'local-so.avro'])
print('sanity check OK')


def split_s3_url(url):
    parsed = urlparse.urlparse(url)
    return parsed.netloc, parsed.path[1:]


def read_avro(fin):
    reader = avro.datafile.DataFileReader(fin, avro.io.DatumReader())
    return list(reader)
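A minimal usage sketch for the two helpers above: read the locally written avro file back through smart_open, and show where split_s3_url would fit for an S3 URL (the bucket and key below are placeholders).

with smart_open.smart_open('local-so.avro', 'rb') as fin:
    records = read_avro(fin)
print('read back %d records' % len(records))

print(split_s3_url('s3://mybucket/some/key.avro'))  # ('mybucket', 'some/key.avro')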