Example #1
0
def eval_on_file(path_composed_emb, path_observed_emb, save_path):
    """Rank composed embeddings against observed ones and write the results.

    Both files are loaded in dense ('dm') format and row-normalised by
    length before being handed to ``evaluateRank``.

    :param path_composed_emb: path of the composed embeddings file
    :param path_observed_emb: path of the observed embeddings file
    :param save_path: prefix of the three result files
        (``_rankedCompounds.txt``, ``_ranks.txt``, ``_quartiles.txt``)
    :return: the tuple ``(q1, q2, q3, ranks)`` produced by ``evaluateRank``
    :raises AssertionError: if a composed word is missing from the observed
        space
    """
    raw_observed_space = Space.build(data=path_observed_emb, format='dm')
    observed_space = raw_observed_space.apply(RowNormalization('length'))
    observed_words = observed_space.get_id2row()
    print("Observed words, size: " + str(len(observed_words)) + ", first:")
    print(observed_words[:10])
    observed_words_set = set(observed_words)

    raw_composed_space = Space.build(data=path_composed_emb, format='dm')
    composed_space = raw_composed_space.apply(RowNormalization('length'))
    composed_words = composed_space.get_id2row()
    print("Composed words, size: " + str(len(composed_words)) + ", first:")
    print(composed_words[:10])

    # All composed words should be in the initial space.  The check is an
    # explicit raise (same exception type as before) so that it still runs
    # when Python is started with -O, and so that the failure names the
    # offending words.
    missing = [word for word in composed_words
               if word not in observed_words_set]
    if missing:
        raise AssertionError(
            "%d composed word(s) missing from the observed space, e.g. %s"
            % (len(missing), missing[:10]))

    q1, q2, q3, ranks = evaluateRank(composed_words, composed_space,
                                     observed_space)
    print("Q1: " + str(q1) + ", Q2: " + str(q2) + ", Q3: " + str(q3))

    printDictToFile(ranks, save_path + '_rankedCompounds.txt')

    # The rank values are also written on their own, in ascending order.
    sortedRanks = sorted(ranks.values())
    printListToFile(sortedRanks, save_path + '_ranks.txt')
    logResult(q1, q2, q3, save_path + '_quartiles.txt')

    return q1, q2, q3, ranks
Example #2
0
    def test_build_data(self):
        """Build spaces from sparse and dense fixture files and verify them.

        Every case holds a fixture name, the expected row and column labels,
        the expected sparse matrix and the expected dense matrix (``None``
        when the dense file is not checked for that fixture).
        """
        cases = [
            ("data1", ["red", "blue"], ["car", "man"],
             np.mat([[3, 5], [0, 10]]), np.mat([[3, 5], [0, 10]])),
            ("data2", ["red"], ["car"],
             np.mat([[3]]), np.mat([[3]])),
            ("data3", ["red", "blue"], ["car", "man"],
             np.mat([[15, 0], [0, 6]]), np.mat([[5, 0], [0, 6]])),
            ("data7", ["red"], ["car"], np.mat([[0]]), np.mat([[0]])),
            ("data9", ["man"], ["car"], np.mat([[4]]), None),
        ]

        for name, exp_rows, exp_cols, exp_sparse, exp_dense in cases:
            prefix = self.dir_ + name

            # Sparse format: column labels come from the .cols file.
            sparse_space = Space.build(data=prefix + ".sparse",
                                       cols=prefix + ".cols",
                                       format="sm")
            self.assertListEqual(exp_rows, sparse_space.id2row)
            self.assertListEqual(exp_cols, sparse_space.id2column)
            self.assertIsInstance(sparse_space.cooccurrence_matrix,
                                  SparseMatrix)
            np.testing.assert_array_equal(
                exp_sparse, sparse_space.cooccurrence_matrix.mat.todense())

            if exp_dense is None:
                continue

            # Dense format: built without a column file, so no column labels.
            dense_space = Space.build(data=prefix + ".dense", format="dm")
            self.assertListEqual(exp_rows, dense_space.id2row)
            self.assertListEqual([], dense_space.id2column)
            self.assertIsInstance(dense_space.cooccurrence_matrix,
                                  DenseMatrix)
            np.testing.assert_array_equal(
                exp_dense, dense_space.cooccurrence_matrix.mat)
Example #3
0
def train_from_core(lexical_space_file, an_dn_file, pn_file, sv_file, vo_file, output_file_prefix):
    
    if (not exists(lexical_space_file) or not exists(pn_file) or not exists(sv_file)
        or not exists(vo_file) or not exists(an_dn_file)):
        print "some file doesn't exist"
        print lexical_space_file, an_dn_file, pn_file, sv_file, vo_file
    
    print "load core"
    core_space = Space.build(data=lexical_space_file, format="dm")
    print "load an dn"
    
    an_dn_space = Space.build(data=an_dn_file, format="dm")
    print "load pn"
    pn_space = Space.build(data=pn_file, format="dm")
    print "load sv"
    sv_space = Space.build(data=sv_file, format="dm")
    print "load vo"
    vo_space = Space.build(data=vo_file, format="dm")
    
    print "start training"
    all_mat_space_normed = train_all_spaces(core_space, an_dn_space, 
                                     pn_space, sv_space, vo_space)
    print "exporting trained file"
    all_mat_space_normed.export(output_file_prefix, format="dm")
    del all_mat_space_normed
    print "DONE"
Example #4
0
    def test_simple_dense(self):
        """Check the dense outputs of two ``bcs.main`` runs.

        The first run reads ``mat2`` in "dm" format; the second run reads the
        pickle written by the first one.
        """
        base = self.dir_
        log_file = base + "log1.txt"

        # Run 1: dense text input, dense text output.
        run_from_dm = ["build_core_space.py",
                       "-l", log_file,
                       "-i", base + "mat2",
                       "-o", base,
                       "--input_format", "dm",
                       "--output_format", "dm"]
        bcs.main(run_from_dm)

        source = Space.build(data=base + "mat2.dm", format="dm")
        exported = Space.build(data=base + "CORE_SS.mat2.dm", format="dm")
        pickled = io_utils.load(base + "CORE_SS.mat2.pkl", Space)

        self._test_equal_spaces_dense(source, exported)
        self._test_equal_spaces_dense(source, pickled)

        # Run 2: the pickle from run 1 is the input.
        run_from_pkl = ["build_core_space.py",
                        "-l", log_file,
                        "-i", base + "CORE_SS.mat2",
                        "-o", base,
                        "--input_format", "pkl",
                        "--output_format", "dm"]
        bcs.main(run_from_pkl)

        second_pass = io_utils.load(base + "CORE_SS.CORE_SS.mat2.pkl", Space)
        first_pass = io_utils.load(base + "CORE_SS.mat2.pkl", Space)

        self._test_equal_spaces_dense(second_pass, first_pass)
Example #5
0
    def setUp(self):
        """Create two dense 2x2 spaces that share the same feature list."""
        self.ft = ["f1", "f2"]

        # Each space gets its own matrix object, even though the values match.
        n_matrix = DenseMatrix(np.mat([[3, 4], [5, 6]]))
        an_matrix = DenseMatrix(np.mat([[3, 4], [5, 6]]))

        self.n_space = Space(n_matrix, ["car", "man"], self.ft)
        self.an_space = Space(an_matrix, ["a1_car", "a1_man"], self.ft)
    def test_simple_sparse_zipped(self):
        """Run ``bcs.main`` with ``--gz True`` on a sparse matrix and compare.

        The space read from the gzipped input must equal the exported sparse
        files, the exported pickle and the plain ``mat1.sm`` file.
        """
        base = self.dir_
        input_cols = base + "mat1.cols"

        command = ["build_core_space.py",
                   "-l", base + "log1.txt",
                   "-i", base + "mat1",
                   "-o", base,
                   "--input_format", "sm",
                   "--output_format", "sm",
                   "--gz", "True"]
        bcs.main(command)

        zipped = Space.build(data=base + "mat1.sm.gz",
                             cols=input_cols,
                             format="sm")
        exported = Space.build(data=base + "CORE_SS.mat1.sm",
                               cols=base + "CORE_SS.mat1.cols",
                               format="sm")
        pickled = io_utils.load(base + "CORE_SS.mat1.pkl", Space)
        unzipped = Space.build(data=base + "mat1.sm",
                               cols=input_cols,
                               format="sm")

        for other in (exported, pickled, unzipped):
            self._test_equal_spaces_sparse(zipped, other)
def main():
    """
    Extract the rows of a temporal referencing matrix that belong to one
    reference and save them as a regular (binned) matrix.
    """

    # Command line interface
    usage = """Convert temporal referencing matrix to regular (binned) matrix.

    Usage:
        tr2bin.py (-w | -s) <spacePrefix> <ref> <outPath>

        <spacePrefix> = path to pickled space without suffix
        <ref> = reference string
        <outPath> = output path for result file

    Options:
        -w, --w2v   save in w2v format
        -s, --sps   save in sparse matrix format
        
    """
    args = docopt(usage)

    is_w2v = args['--w2v']
    is_sps = args['--sps']
    spacePrefix = args['<spacePrefix>']
    ref = args['<ref>']
    outPath = args['<outPath>']

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    start_time = time.time()

    # Load the input space and pull out its parts
    space = load_pkl_files(spacePrefix)
    matrix = space.get_cooccurrence_matrix().get_mat()
    id2row = space.get_id2row()
    id2column = space.get_id2column()

    # Keep rows whose label has no '_' at all, or has exactly one '_'
    # followed by the reference string; the part before '_' is the target.
    selected = []
    for index, word in enumerate(id2row):
        parts = word.split('_')
        if len(parts) == 1 or (len(parts) == 2 and parts[1] == ref):
            selected.append((parts[0], index))
    targets, indices = zip(*selected)

    new_matrix = matrix[list(indices), :]

    # Save the result in the requested format
    if is_w2v:
        dense_space = Space(DenseMatrix(new_matrix), list(targets), id2column)
        save_pkl_files(dense_space,
                       outPath,
                       save_in_one_file=True,
                       save_as_w2v=True)
    if is_sps:
        sparse_space = Space(SparseMatrix(new_matrix), list(targets),
                             id2column)
        save_pkl_files(sparse_space,
                       outPath,
                       save_in_one_file=True,
                       save_as_w2v=False)

    logging.info("--- %s seconds ---" % (time.time() - start_time))
Example #8
0
def eval_on_file(path_composed_emb, path_observed_emb, save_path):
    """Rank composed embeddings against observed ones and write the results.

    Both files are loaded in dense ('dm') format and row-normalised by
    length before being handed to ``evaluateRank``.

    :param path_composed_emb: path of the composed embeddings file
    :param path_observed_emb: path of the observed embeddings file
    :param save_path: prefix of the three result files
        (``_rankedCompounds.txt``, ``_ranks.txt``, ``_quartiles.txt``)
    :return: the tuple ``(q1, q2, q3, ranks)`` produced by ``evaluateRank``
    :raises AssertionError: if a composed word is missing from the observed
        space
    """
    raw_observed_space = Space.build(data=path_observed_emb, format='dm')
    observed_space = raw_observed_space.apply(RowNormalization('length'))
    observed_words = observed_space.get_id2row()
    print("Observed words, size: " + str(len(observed_words)) + ", first:")
    print(observed_words[:10])
    observed_words_set = set(observed_words)

    raw_composed_space = Space.build(data=path_composed_emb, format='dm')
    composed_space = raw_composed_space.apply(RowNormalization('length'))
    composed_words = composed_space.get_id2row()
    print("Composed words, size: " + str(len(composed_words)) + ", first:")
    print(composed_words[:10])

    # All composed words should be in the initial space.  The check is an
    # explicit raise (same exception type as before) so that it still runs
    # when Python is started with -O, and so that the failure names the
    # offending words.
    missing = [word for word in composed_words
               if word not in observed_words_set]
    if missing:
        raise AssertionError(
            "%d composed word(s) missing from the observed space, e.g. %s"
            % (len(missing), missing[:10]))

    q1, q2, q3, ranks = evaluateRank(composed_words, composed_space, observed_space)
    print("Q1: " + str(q1) + ", Q2: " + str(q2) + ", Q3: " + str(q3))

    printDictToFile(ranks, save_path + '_rankedCompounds.txt')

    # The rank values are also written on their own, in ascending order.
    sortedRanks = sorted(ranks.values())
    printListToFile(sortedRanks, save_path + '_ranks.txt')
    logResult(q1, q2, q3, save_path + '_quartiles.txt')

    return q1, q2, q3, ranks
Example #9
0
    def setUp(self):
        """Wrap the same 2x3 array in a sparse-backed and a dense-backed space."""
        self.a = np.array([[1, 2, 3], [4, 0, 5]])

        sparse_backing = SparseMatrix(np.mat(self.a))
        self.space_s = Space(sparse_backing, ["a", "b"], ["f1", "f2", "f3"])

        dense_backing = DenseMatrix(np.mat(self.a))
        self.space_d = Space(dense_backing, ["a", "b"], ["f1", "f2", "f3"])
Example #10
0
    def setUp(self):
        """Prepare the resource directory, constructor cases and sample spaces."""
        self.dir_ = data_dir + "/space_test_resources/"

        # Constructor scenario with column labels ...
        labelled_case = (
            DenseMatrix(np.array([[1, 2], [3, 4]])),
            ["car", "man"],
            ["feat1", "feat2"],
            {"man": 1, "car": 0},
            {"feat1": 0, "feat2": 1},
            [ScalingOperation(EpmiWeighting())],
        )
        # ... and the same scenario without any column labels.
        unlabelled_case = (
            DenseMatrix(np.array([[1, 2], [3, 4]])),
            ["car", "man"],
            [],
            {"man": 1, "car": 0},
            {},
            [ScalingOperation(EpmiWeighting())],
        )
        self.init_test_cases = [labelled_case, unlabelled_case]

        # One-row space.
        self.m1 = np.array([[1, 2, 3]])
        self.row1 = ["a"]
        self.row2 = ["a", "b", "c"]
        self.ft1 = ["f1", "f2", "f3"]
        self.space1 = Space(DenseMatrix(self.m1), self.row1, self.ft1)

        # Three-row space plus a stored 3x2 reference matrix.
        self.x = np.mat([[1, 2, 3], [2, 4, 6], [4, 675, 43]])
        self.us = np.mat([[2.19272110e+00, 3.03174768e+00],
                          [4.38544220e+00, 6.06349536e+00],
                          [6.76369708e+02, -4.91431927e-02]])
        self.space2 = Space(DenseMatrix(self.x), self.row2, self.ft1)
    def test_simple_dense(self):
        """Check the dense outputs of two ``bcs.main`` runs.

        The first run reads ``mat2`` in "dm" format; the second run reads the
        pickle written by the first one.
        """
        base = self.dir_

        def run(input_name, input_format):
            # Both runs share the log file, output directory and dm output.
            bcs.main(["build_core_space.py",
                      "-l", base + "log1.txt",
                      "-i", base + input_name,
                      "-o", base,
                      "--input_format", input_format,
                      "--output_format", "dm"])

        run("mat2", "dm")

        source = Space.build(data=base + "mat2.dm", format="dm")
        exported = Space.build(data=base + "CORE_SS.mat2.dm", format="dm")
        pickled = io_utils.load(base + "CORE_SS.mat2.pkl", Space)

        self._test_equal_spaces_dense(source, exported)
        self._test_equal_spaces_dense(source, pickled)

        run("CORE_SS.mat2", "pkl")

        second_pass = io_utils.load(base + "CORE_SS.CORE_SS.mat2.pkl", Space)
        first_pass = io_utils.load(base + "CORE_SS.mat2.pkl", Space)

        self._test_equal_spaces_dense(second_pass, first_pass)
    def test_as_conversion_tool(self):
        """Use ``bcs.main`` purely to convert ``mat3`` between formats.

        Three conversions are exercised in turn: sm to sm, sm to dm and
        dm to dm.  After each one the exported files and the exported pickle
        are compared with a reference space.
        """
        base = self.dir_

        def convert(input_format, output_format):
            bcs.main(["build_core_space.py",
                      "-i", base + "mat3",
                      "-o", base,
                      "--input_format", input_format,
                      "--output_format", output_format])

        def load_pickle():
            return io_utils.load(base + "CORE_SS.mat3.pkl", Space)

        # --- sparse in, sparse out ---
        convert("sm", "sm")

        reference = Space.build(data=base + "mat3.sm",
                                cols=base + "mat3.cols",
                                format="sm")
        exported = Space.build(data=base + "CORE_SS.mat3.sm",
                               rows=base + "CORE_SS.mat3.rows",
                               cols=base + "CORE_SS.mat3.cols",
                               format="sm")
        pickled = load_pickle()

        self._test_equal_spaces_sparse(reference, exported)
        self._test_equal_spaces_sparse(reference, pickled)

        # --- sparse in, dense out ---
        convert("sm", "dm")

        reference = Space.build(data=base + "mat3.dm",
                                cols=base + "CORE_SS.mat3.cols",
                                format="dm")
        exported = Space.build(data=base + "CORE_SS.mat3.dm",
                               rows=base + "CORE_SS.mat3.rows",
                               cols=base + "CORE_SS.mat3.cols",
                               format="dm")
        pickled = load_pickle()

        self._test_equal_spaces_dense(reference, exported)
        # The pickled space is converted before the dense comparison.
        pickled.to_dense()
        self._test_equal_spaces_dense(reference, pickled)

        # --- dense in, dense out ---
        convert("dm", "dm")

        reference = Space.build(data=base + "CORE_SS.mat3.dm",
                                cols=base + "CORE_SS.mat3.cols",
                                format="dm")
        pickled = load_pickle()

        pickled.to_dense()
        self._test_equal_spaces_dense(reference, pickled)
 def test_simple_lstsq_no_inter(self):
     """Train a lexical function without intercept, with lstsq and ridge.

     After each ``tc.main`` run the pickled model and the exported
     parameter file are checked; the same results are expected from both.
     """
     base = self.dir_
     common_args = ["train_composition.py",
                    "-l", base + "log1.txt",
                    "-i", base + "an_train_data.txt",
                    "-o", base,
                    "-m", "lexical_func",
                    "-p", base + "CORE_SS.AN_mat.pkl",
                    "-a", base + "CORE_SS.N_mat.pkl"]

     def check_outputs():
         # Inspect the pickled model ...
         trained = io_utils.load(base + "TRAINED_COMP_MODEL.lexical_func.an_train_data.txt.pkl")
         function_space = trained.function_space
         np.testing.assert_array_almost_equal(
             function_space.cooccurrence_matrix.mat,
             np.mat([1,0,0,1]), 10)
         self.assertTupleEqual(function_space.element_shape, (2,2))
         self.assertListEqual(function_space.id2row, ["big"])
         self.assertListEqual(function_space.id2column, [])

         # ... and compare it with the exported parameter file.
         exported = Space.build(data=base + "TRAINED_COMP_MODEL.lexical_func.an_train_data.txt.params.dm",
                                format="dm")
         self._test_equal_spaces_dense(exported, function_space)

     tc.main(common_args + ["-r", "lstsq",
                            "--intercept", "False",
                            "--export_params", "True"])
     check_outputs()

     tc.main(common_args + ["-r", "ridge",
                            "--lambda", "0",
                            "--crossvalidation", "False",
                            "--intercept", "False",
                            "--export_params", "True"])
     check_outputs()
Example #14
0
    def test_simple_ops(self):
        """Project ``mat3`` into each core space and compare with that core.

        ``bcs.main`` is run once with several selection and reduction
        options.  For every resulting core space, ``bps.main`` is run with
        "dm" and then with "sm" input; each time the output must equal the
        core space.
        """
        base = self.dir_

        bcs.main(["build_core_space.py",
                  "-l", base + "log1.txt",
                  "-i", base + "mat3",
                  "-w", "raw",
                  "-s", "top_sum_3,top_length_3,top_sum_4",
                  "-r", "svd_2,svd_1",
                  "-o", base,
                  "--input_format", "dm",
                  "--output_format", "dm"
                  ])

        core_names = ["CORE_SS.mat3.raw.top_sum_3.svd_2",
                      "CORE_SS.mat3.raw.top_sum_3.svd_1",
                      "CORE_SS.mat3.raw.top_length_3.svd_2",
                      "CORE_SS.mat3.raw.top_length_3.svd_1",
                      "CORE_SS.mat3.raw.top_sum_4.svd_2",
                      "CORE_SS.mat3.raw.top_sum_4.svd_1"
                      ]

        # All core spaces are loaded before any peripheral run starts.
        core_spaces = []
        for core_name in core_names:
            core_spaces.append(
                Space.build(data=base + core_name + ".dm", format="dm"))

        def build_peripheral(core_name, input_format):
            bps.main(["build_peripheral_space.py",
                      "-l", base + "log1.txt",
                      "-i", base + "mat3",
                      "-o", base,
                      "-c", base + core_name + ".pkl",
                      "--input_format", input_format,
                      "--output_format", "dm"
                      ])

        for core_name, core_space in zip(core_names, core_spaces):
            for input_format in ("dm", "sm"):
                build_peripheral(core_name, input_format)

                peripheral_file = base + "PER_SS.mat3." + core_name + ".dm"
                peripheral = Space.build(data=peripheral_file, format="dm")
                self._test_equal_spaces_dense(core_space, peripheral)
Example #15
0
    def test_dilation(self):
        """Export a ``Dilation`` model before and after training."""
        self.m12 = DenseMatrix(np.mat([[3, 1], [9, 2]]))
        self.m22 = DenseMatrix(np.mat([[4, 3], [2, 1]]))
        self.ph2 = DenseMatrix(np.mat([[18, 11], [24, 7]]))
        self.row = ["a", "b"]
        self.ft = ["f1", "f2"]

        phrase_rows = ["a_a", "a_b"]
        self.space1 = Space(DenseMatrix(self.m12), self.row, self.ft)
        self.space2 = Space(DenseMatrix(self.ph2), phrase_rows, self.ft)

        model = Dilation()
        # First export happens on the untrained model.
        model.export(self.prefix + ".dil1")

        training_pairs = [("a", "b", "a_b")]
        model.train(training_pairs, self.space1, self.space2)
        model.export(self.prefix + ".dil2")
Example #16
0
    def test_weighted_additive(self):
        """Export a ``WeightedAdditive`` model before and after training."""
        self.m12 = DenseMatrix(np.mat([[3, 1], [9, 2]]))
        self.m22 = DenseMatrix(np.mat([[4, 3], [2, 1]]))
        self.ph2 = DenseMatrix(np.mat([[18, 11], [24, 7]]))
        self.row = ["a", "b"]
        self.ft = ["f1", "f2"]

        phrase_rows = ["a_a", "a_b"]
        self.space1 = Space(DenseMatrix(self.m12), self.row, self.ft)
        self.space2 = Space(DenseMatrix(self.ph2), phrase_rows, self.ft)

        model = WeightedAdditive()
        # First export happens on the untrained model.
        model.export(self.prefix + ".add1")

        training_pairs = [("a", "a", "a_a")]
        model.train(training_pairs, self.space1, self.space2)
        model.export(self.prefix + ".add2")
Example #17
0
    def test_simple_dense(self):
        """Run ``bps.main`` on ``mat2`` against its own core space.

        The exported peripheral space must equal the space built from the
        input file.
        """
        base = self.dir_
        command = ["build_peripheral_space.py",
                   "-l", base + "log1.txt",
                   "-i", base + "mat2",
                   "-o", base,
                   "-c", base + "CORE_SS.mat2.pkl",
                   "--input_format", "dm",
                   "--output_format", "dm"]
        bps.main(command)

        source = Space.build(data=base + "mat2.dm", format="dm")
        peripheral = Space.build(data=base + "PER_SS.mat2.CORE_SS.mat2.dm",
                                 format="dm")

        self._test_equal_spaces_dense(source, peripheral)
Example #18
0
    def test_train_intercept(self):
        """Train and compose a ``LexicalFunction`` with an intercept.

        Phrase vectors are generated by multiplying two 2x2 matrices with a
        set of noun vectors.  A model is trained on four of the six phrases,
        its function space is checked, and then a second model built from
        that function space composes the same four phrases.
        """
        a1_mat = DenseMatrix(np.mat([[3, 4], [5, 6]]))
        a2_mat = DenseMatrix(np.mat([[1, 2], [3, 4]]))

        train_data = [("a1", "man", "a1_man"),
                      ("a2", "car", "a2_car"),
                      ("a1", "boy", "a1_boy"),
                      ("a2", "boy", "a2_boy")
        ]

        n_mat = DenseMatrix(np.mat([[13, 21], [3, 4], [5, 6]]))
        n_space = Space(n_mat, ["man", "car", "boy"], self.ft)

        # One phrase row per noun row for each of the two matrices.
        an1_mat = (a1_mat * n_mat.transpose()).transpose()
        an2_mat = (a2_mat * n_mat.transpose()).transpose()
        an_mat = an1_mat.vstack(an2_mat)

        # Row labels follow the stacking order: a1 phrases first, then a2.
        an_space = Space(an_mat, ["a1_man", "a1_car", "a1_boy", "a2_man", "a2_car", "a2_boy"], self.ft)

        # Test training.
        model = LexicalFunction(learner=LstsqRegressionLearner(intercept=True))
        model.train(train_data, n_space, an_space)
        a_space = model.function_space

        # The return value is discarded and the comparison below is disabled.
        # NOTE(review): presumably reshape works in place - confirm.
        a1_mat.reshape((1, 4))
        #np.testing.assert_array_almost_equal(a1_mat.mat,
        #                                     a_space.cooccurrence_matrix.mat[0])

        a2_mat.reshape((1, 4))
        #np.testing.assert_array_almost_equal(a2_mat.mat,
        #                                     a_space.cooccurrence_matrix.mat[1])

        self.assertListEqual(a_space.id2row, ["a1", "a2"])
        # NOTE(review): the third column presumably holds the intercept - confirm.
        self.assertTupleEqual(a_space.element_shape, (2, 3))

        # Test composition.
        a1_mat = DenseMatrix(np.mat([[3, 4, 5, 6]]))
        a2_mat = DenseMatrix(np.mat([[1, 2, 3, 4]]))
        a_mat = a_space.cooccurrence_matrix

        # Rebuild the function space from the trained matrix and compose with it.
        a_space = Space(a_mat, ["a1", "a2"], [], element_shape=(2, 3))
        model = LexicalFunction(function_space=a_space, intercept=True)
        comp_space = model.compose(train_data, n_space)

        self.assertListEqual(comp_space.id2row, ["a1_man", "a2_car", "a1_boy", "a2_boy"])
        self.assertListEqual(comp_space.id2column, [])

        self.assertEqual(comp_space.element_shape, (2,))

        # Rows 0, 4, 2, 5 of an_mat are a1_man, a2_car, a1_boy, a2_boy,
        # i.e. the phrases of train_data in order.
        np.testing.assert_array_almost_equal(comp_space.cooccurrence_matrix.mat,
                                             an_mat[[0, 4, 2, 5]].mat, 8)
Example #19
0
    def test_simple_lstsq_no_inter(self):
        """Train a lexical function without intercept, with lstsq and ridge.

        After each ``tc.main`` run the pickled model and the exported
        parameter file are checked; the same results are expected from both.
        """
        base = self.dir_
        common_args = ["train_composition.py",
                       "-l", base + "log1.txt",
                       "-i", base + "an_train_data.txt",
                       "-o", base,
                       "-m", "lexical_func",
                       "-p", base + "CORE_SS.AN_mat.pkl",
                       "-a", base + "CORE_SS.N_mat.pkl"]

        def check_outputs():
            # Inspect the pickled model ...
            trained = io_utils.load(
                base +
                "TRAINED_COMP_MODEL.lexical_func.an_train_data.txt.pkl")
            function_space = trained.function_space
            np.testing.assert_array_almost_equal(
                function_space.cooccurrence_matrix.mat,
                np.mat([1, 0, 0, 1]), 10)
            self.assertTupleEqual(function_space.element_shape, (2, 2))
            self.assertListEqual(function_space.id2row, ["big"])
            self.assertListEqual(function_space.id2column, [])

            # ... and compare it with the exported parameter file.
            exported = Space.build(
                data=base +
                "TRAINED_COMP_MODEL.lexical_func.an_train_data.txt.params.dm",
                format="dm")
            self._test_equal_spaces_dense(exported, function_space)

        tc.main(common_args + ["-r", "lstsq",
                               "--intercept", "False",
                               "--export_params", "True"])
        check_outputs()

        tc.main(common_args + ["-r", "ridge",
                               "--lambda", "0",
                               "--crossvalidation", "False",
                               "--intercept", "False",
                               "--export_params", "True"])
        check_outputs()
Example #20
0
    def test_as_conversion_tool(self):
        """Use ``bcs.main`` purely to convert ``mat3`` between formats.

        Three conversions are exercised in turn: sm to sm, sm to dm and
        dm to dm.  After each one the exported files and the exported pickle
        are compared with a reference space.
        """
        base = self.dir_
        exported_rows = base + "CORE_SS.mat3.rows"
        exported_cols = base + "CORE_SS.mat3.cols"
        exported_pkl = base + "CORE_SS.mat3.pkl"

        def convert(input_format, output_format):
            bcs.main(["build_core_space.py",
                      "-i", base + "mat3",
                      "-o", base,
                      "--input_format", input_format,
                      "--output_format", output_format])

        # --- sparse in, sparse out ---
        convert("sm", "sm")

        reference = Space.build(data=base + "mat3.sm",
                                cols=base + "mat3.cols",
                                format="sm")
        exported = Space.build(data=base + "CORE_SS.mat3.sm",
                               rows=exported_rows,
                               cols=exported_cols,
                               format="sm")
        pickled = io_utils.load(exported_pkl, Space)

        self._test_equal_spaces_sparse(reference, exported)
        self._test_equal_spaces_sparse(reference, pickled)

        # --- sparse in, dense out ---
        convert("sm", "dm")

        reference = Space.build(data=base + "mat3.dm",
                                cols=exported_cols,
                                format="dm")
        exported = Space.build(data=base + "CORE_SS.mat3.dm",
                               rows=exported_rows,
                               cols=exported_cols,
                               format="dm")
        pickled = io_utils.load(exported_pkl, Space)

        self._test_equal_spaces_dense(reference, exported)
        # The pickled space is converted before the dense comparison.
        pickled.to_dense()
        self._test_equal_spaces_dense(reference, pickled)

        # --- dense in, dense out ---
        convert("dm", "dm")

        reference = Space.build(data=base + "CORE_SS.mat3.dm",
                                cols=exported_cols,
                                format="dm")
        pickled = io_utils.load(exported_pkl, Space)

        pickled.to_dense()
        self._test_equal_spaces_dense(reference, pickled)
Example #21
0
    def test_lexical_function(self):
        """Exporting a ``LexicalFunction`` must fail before training only."""
        self.m12 = DenseMatrix(np.mat([[3, 1], [9, 2]]))
        self.m22 = DenseMatrix(np.mat([[4, 3], [2, 1]]))
        self.ph2 = DenseMatrix(np.mat([[18, 11], [24, 7]]))
        self.row = ["a", "b"]
        self.ft = ["f1", "f2"]

        phrase_rows = ["a_a", "a_b"]
        self.space1 = Space(DenseMatrix(self.m12), self.row, self.ft)
        self.space2 = Space(DenseMatrix(self.ph2), phrase_rows, self.ft)

        model = LexicalFunction()
        # Lower the model's sample threshold for this tiny training set.
        model._MIN_SAMPLES = 1

        # Untrained model: export is expected to raise.
        self.assertRaises(IllegalStateError, model.export,
                          self.prefix + ".lf1")

        training_pairs = [("a", "b", "a_b"), ("a", "a", "a_a")]
        model.train(training_pairs, self.space1, self.space2)
        model.export(self.prefix + ".lf2")
Example #22
0
    def test_full_additive(self):
        """Exporting a ``FullAdditive`` model must fail before training only."""
        self.m12 = DenseMatrix(np.mat([[3, 1], [9, 2]]))
        self.m22 = DenseMatrix(np.mat([[4, 3], [2, 1]]))
        self.ph2 = DenseMatrix(np.mat([[18, 11], [24, 7]]))
        self.row = ["a", "b"]
        self.ft = ["f1", "f2"]

        phrase_rows = ["a_a", "a_b"]
        self.space1 = Space(DenseMatrix(self.m12), self.row, self.ft)
        self.space2 = Space(DenseMatrix(self.ph2), phrase_rows, self.ft)

        model = FullAdditive()

        # Untrained model: export is expected to raise.
        self.assertRaises(IllegalStateError, model.export,
                          self.prefix + ".full1")

        training_pairs = [("a", "b", "a_b"), ("a", "a", "a_a")]
        model.train(training_pairs, self.space1, self.space2)

        model.export(self.prefix + ".full2")
Example #23
0
    def test_vstack_raises(self):
        """Stacking ``self.space2`` with a mismatching space raises ValueError."""
        mismatching_spaces = [
            # a single column and a single column label
            Space(DenseMatrix(self.x[0:2, 0:1]), ["e", "f"], self.ft1[0:1]),
            # row label "a" (presumably clashes with a row of self.space2)
            Space(DenseMatrix(self.x[0:2, :]), ["a", "f"], self.ft1),
            # no column labels at all
            Space(DenseMatrix(self.x[0:2, :]), ["e", "f"], []),
            # column labels that differ from self.ft1 in the last entry
            Space(DenseMatrix(self.x[0:2, :]), ["e", "f"],
                  ["f1", "f2", "f4"]),
        ]

        for other in mismatching_spaces:
            self.assertRaises(ValueError, self.space2.vstack,
                              self.space2, other)
Example #24
0
def test_to_dissect_sparse_files(vectors_c, tmpdir):
    """
    Write the thesaurus as sparse dissect files and read them back.

    :type vectors_c: Thesaurus
    :type tmpdir: py.path.local
    """
    from composes.semantic_space.space import Space

    prefix = str(tmpdir.join('output'))
    vectors_c.to_dissect_sparse_files(prefix)

    # All three output files must exist and be regular files.
    written = ['{}.{}'.format(prefix, suffix)
               for suffix in ['sm', 'rows', 'cols']]
    for outfile in written:
        assert os.path.exists(outfile)
        assert os.path.isfile(outfile)

    # Reading the files back must give the same matrix and labels.
    space = Space.build(data="{}.sm".format(prefix),
                        rows="{}.rows".format(prefix),
                        cols="{}.cols".format(prefix),
                        format="sm")
    matrix = space.cooccurrence_matrix.mat
    rows = space.id2row
    cols = space.id2column

    # to_sparse_matrix returns the columns before the rows.
    exp_matrix, exp_cols, exp_rows = vectors_c.to_sparse_matrix()

    assert exp_cols == cols
    assert exp_rows == rows
    assert_array_equal(exp_matrix.A, matrix.A)

    _assert_matrix_of_thesaurus_c_is_as_expected(matrix.A, rows, cols)
    _assert_matrix_of_thesaurus_c_is_as_expected(exp_matrix.A, exp_rows,
                                                 exp_cols)
Example #25
0
def load_pkl_files(dsm_prefix):
    """
    Load a space stored either as one pickle file or as several files.

    :param dsm_prefix: path prefix of the stored space, without suffix
    :return: the result of ``io_utils.load`` when ``<prefix>.pkl`` exists,
        otherwise a Space assembled from the npz matrix and the four pickled
        index structures
    """
    single_file = dsm_prefix + '.pkl'

    # A single pickle holding the whole Space object takes precedence.
    if os.path.isfile(single_file):
        return io_utils.load(single_file)

    def unpickle(suffix):
        # NOTE: pickle must only be used on trusted files.
        with open(dsm_prefix + suffix, 'rb') as f_in:
            return pickle.load(f_in)

    # Matrix: stored in COO layout inside the npz archive, converted to CSR.
    with np.load(dsm_prefix + 'cooc.npz') as loader:
        entries = (loader['data'], (loader['row'], loader['col']))
        coo = coo_matrix(entries, shape=loader['shape'])
    cooccurrence_matrix = SparseMatrix(csr_matrix(coo))

    # Remaining data members of Space, one pickle each.
    row2id = unpickle('_row2id.pkl')
    id2row = unpickle('_id2row.pkl')
    column2id = unpickle('_column2id.pkl')
    id2column = unpickle('_id2column.pkl')

    return Space(cooccurrence_matrix, id2row, id2column,
                 row2id=row2id, column2id=column2id)
Example #26
0
def main():
    """Convert a vecf text file into a pickled dissect Space.

    The first input line holds the vocabulary size and the vector
    dimensionality, separated by a space.  Every following line holds a word
    followed by its vector components.  The resulting Space has the words as
    rows and no column labels.
    """
    parser = argparse.ArgumentParser(
        description="Converts a vecf file to dissect pkl format.")
    # Both files are needed; without them the code below cannot run.
    parser.add_argument('--input',
                        '-i',
                        type=argparse.FileType('r'),
                        required=True,
                        help='Input file')
    # Pickle data is binary, so the output is opened in binary mode.
    parser.add_argument('--output',
                        '-o',
                        type=argparse.FileType('wb'),
                        required=True,
                        help='Output file')
    args = parser.parse_args()

    try:
        header = args.input.readline().rstrip()
        vocab_s, dims = map(int, header.split(" "))

        vocab = []

        # init matrix (builtin float: the np.float alias no longer exists
        # in current NumPy releases)
        matrix = np.zeros((vocab_s, dims), dtype=float)

        for i, line in enumerate(args.input):
            data = line.split()
            vocab.append(data[0])
            # A list is built explicitly because map() is lazy on Python 3.
            matrix[i] = np.array([float(value) for value in data[1:]])
    finally:
        args.input.close()

    dm = DenseMatrix(matrix)
    sp = Space(dm, vocab, [])
    try:
        pickle.dump(sp, args.output)
    finally:
        args.output.close()
Example #27
0
def main():
    """
    Read an EPMI matrix stored as npz, convert it to a shifted PPMI space
    and store that space as a single pickle file.
    """

    # Parse the command line
    args = docopt(
        '''Transform EPMI matrix in npz format to SPPMI space and save as pickle file.

    Usage:
        transform_matrix_epmi2sppmi.py <spacePrefix> <outPath> <k>

        <spacePrefix> = path to npz without suffix
        <outPath> = output path for space
        <k> = shifting parameter
    
    ''')

    prefix = args['<spacePrefix>']
    out_path = args['<outPath>']
    shift = int(args['<k>'])

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.info(__file__.upper())
    started = time.time()

    # Rebuild the CSR matrix from the arrays stored in the npz archive
    with np.load(prefix + '.npz') as npz:
        csr_parts = (npz['data'], npz['indices'], npz['indptr'])
        matrix = csr_matrix(csr_parts, shape=npz['shape'])

    # Row labels (target words), one per line
    with open(prefix + '.words.vocab') as words_file:
        id2row = [entry.strip() for entry in words_file if entry]

    # Column labels (contexts), one per line
    with open(prefix + '.contexts.vocab') as contexts_file:
        id2column = [entry.strip() for entry in contexts_file if entry]

    # Log-weight the stored values, then shift them by log(k)
    matrix.data = np.log(matrix.data)
    matrix.data -= np.log(shift)

    # Clip everything that is not strictly positive to zero ...
    not_positive = matrix.data <= 0
    matrix.data[not_positive] = 0.0

    # ... and drop the explicit zeros from the sparse structure
    matrix.eliminate_zeros()

    # Wrap the result in a Space and save it as one pickle file
    sppmi_space = Space(SparseMatrix(matrix), id2row, id2column)
    save_pkl_files(sppmi_space, out_path + 'ppmi.sm', save_in_one_file=True)

    logging.info("--- %s seconds ---" % (time.time() - started))
Example #28
0
def vstack(s1, s2):
    """Stack two spaces, tolerating a missing (falsy) operand.

    If ``s1`` is falsy ``s2`` is returned as is; otherwise, if ``s2`` is
    falsy ``s1`` is returned.  Only when both are truthy is
    ``Space.vstack`` called.
    """
    if s1 and s2:
        return Space.vstack(s1, s2)
    return s1 if s1 else s2
def add_zero_idenity_matrix(matrix_space, vector_length):
    """Append two helper rows to ``matrix_space`` and return the stacked space.

    The rows are a flattened ``vector_length`` x ``vector_length`` zero
    matrix ("cg.zeromat") and the flattened identity matrix of the same
    size ("cg.identmat").
    """
    flat_size = vector_length * vector_length
    helper_rows = np.vstack([
        np.zeros((1, flat_size)),
        np.eye(vector_length).reshape((1, flat_size)),
    ])
    helper_space = Space(DenseMatrix(helper_rows),
                         ["cg.zeromat", "cg.identmat"],
                         [])
    return Space.vstack(matrix_space, helper_space)
Example #30
0
def build_raw_per_space(in_file_prefix, in_format, is_gz):
    """Load or build the raw space stored under ``in_file_prefix``.

    Args:
        in_file_prefix: path of the input files without extension.
        in_format: one of "sm", "dm" or "pkl".
        is_gz: if true, ".gz" is appended to the data file name (ignored
            for the "pkl" format).

    Returns:
        The Space loaded from ``<prefix>.pkl`` or built from the data file
        plus the optional ``<prefix>.rows`` / ``<prefix>.cols`` files.

    Raises:
        ValueError: for an unknown format, or when the format is "sm" and
            the column file does not exist.
    """
    if in_format not in ("sm", "dm", "pkl"):
        raise ValueError("Invalid input format:%s" % in_format)

    data_file = "%s.%s" % (in_file_prefix, in_format)

    if in_format == "pkl":
        space = io_utils.load(data_file, Space)

    else:
        if is_gz:
            data_file = "%s.gz" % data_file
        row_file = "%s.rows" % (in_file_prefix)
        column_file = "%s.cols" % (in_file_prefix)
        # Missing label files are passed to Space.build as None ...
        if not os.path.exists(row_file):
            row_file = None
        if not os.path.exists(column_file):
            # ... except that the sparse format cannot do without columns.
            if in_format == "sm":
                raise ValueError("Column file: %s needs to be provided!" % column_file)
            column_file = None
        # Parenthesised form prints the same text on Python 2 and is valid
        # syntax on Python 3.
        print("Building matrix...")
        space = Space.build(data=data_file, rows=row_file, cols=column_file, format=in_format)

    return space
Example #31
0
def build_raw_per_space(in_file_prefix, in_format, is_gz):
    """Return the raw space found under ``in_file_prefix``.

    A "pkl" input is loaded directly.  For "sm" and "dm" the space is built
    from the data file (with ".gz" appended when ``is_gz`` is true) and the
    ``.rows`` / ``.cols`` files, where present.  A ValueError is raised for
    an unsupported format or for an "sm" input without a column file.
    """
    if in_format not in ("sm", "dm", "pkl"):
        raise ValueError("Invalid input format:%s" % in_format)

    data_path = '%s.%s' % (in_file_prefix, in_format)

    # Pickled spaces need no further assembling
    if in_format == "pkl":
        return io_utils.load(data_path, Space)

    if is_gz:
        data_path = '%s.gz' % data_path

    rows_path = '%s.rows' % (in_file_prefix)
    cols_path = '%s.cols' % (in_file_prefix)

    # Label files that do not exist are handed over as None
    if not os.path.exists(rows_path):
        rows_path = None

    if not os.path.exists(cols_path):
        if in_format == "sm":
            raise ValueError("Column file: %s needs to be provided!" %
                             cols_path)
        cols_path = None

    print("Building matrix...")
    return Space.build(data=data_path,
                       rows=rows_path,
                       cols=cols_path,
                       format=in_format)
Example #32
0
def main():
    """Convert a VW topic output file into a pickled Space.

    Every line of the input holds the space-separated weights of one
    document; the docnames file holds one document name per line (blank
    lines are ignored).  The weights become the rows of a dense matrix
    whose row labels are the document names, and the resulting Space is
    pickled to the output.
    """
    # The text is the description; passed positionally it would have been
    # taken as the program name (`prog`).
    parser = argparse.ArgumentParser(
        description='Converts a VW topic output to a COMPOSES pkl file.')
    parser.add_argument('--input',
                        '-i',
                        type=argparse.FileType('r'),
                        help='Input file')
    parser.add_argument('--docnames',
                        '-d',
                        type=argparse.FileType('r'),
                        help='Docnames file')
    # Pickle data is binary: open the file with 'wb' and, on Python 3, fall
    # back to the byte-level buffer of stdout rather than its text layer.
    parser.add_argument('--output',
                        '-o',
                        type=argparse.FileType('wb'),
                        default=getattr(sys.stdout, 'buffer', sys.stdout),
                        help='Output file')

    args = parser.parse_args()
    docnames = [l for l in (l.strip() for l in args.docnames) if l]
    matrix = None
    for i, line in enumerate(args.input):
        # A list (not a lazy `map`) so that len() works on Python 3 as well.
        weights = [float(value) for value in line.strip().split(" ")]
        if matrix is None:
            # The first line fixes the number of columns; `np.float` no
            # longer exists in current NumPy, builtin float is the same dtype.
            matrix = np.zeros((len(docnames), len(weights)), dtype=float)
        matrix[i] = np.array(weights)

    dm = DenseMatrix(matrix)
    sp = Space(dm, docnames, [])
    pickle.dump(sp, args.output)
    args.output.close()
Example #33
0
def read_mikolov(spacefile):
    """Read a binary vector file into a dense Space.

    The file starts with a text header ``<vocab_size> <dims>``.  Each record
    that follows is a word, a single space, ``dims`` packed floats
    (``FLOAT_SIZE`` bytes each) and one trailing byte that is discarded
    (presumably a newline -- the format looks like the word2vec binary
    format, TODO confirm).

    Args:
        spacefile: file object opened in binary mode.

    Returns:
        A Space with a dense matrix, the words as rows and no column labels.

    Raises:
        ValueError: if a record contains no space separating the word from
            its vector data.
    """
    header = spacefile.readline().rstrip()
    # split() without an argument works for both str and bytes headers
    vocab_s, dims = map(int, header.split())

    vocab = []

    # init matrix (`np.float` no longer exists in current NumPy; builtin
    # float is the same dtype)
    matrix = np.zeros((vocab_s, dims), dtype=float)

    # word data + the one trailing byte that follows every vector
    record_size = FLOAT_SIZE * dims + 1

    i = 0
    while True:
        line = spacefile.readline()
        if not line:
            break
        # b" " equals " " on Python 2 and matches the bytes read on Python 3
        sep = line.find(b" ")
        if sep == -1:
            raise ValueError(
                "Couldn't find the vocab/data separation character! Space file corruption?"
            )

        word = line[:sep]
        data = line[sep + 1:]
        # The packed floats may contain newline bytes, in which case
        # readline() stopped early: fetch the rest of the record.
        if len(data) < record_size:
            data += spacefile.read(record_size - len(data))
        data = data[:-1]
        if not isinstance(word, str):
            # Python 3 yields bytes; row labels are kept as text
            word = word.decode("utf-8")
        vocab.append(word)
        vector = (struct.unpack("%df" % dims, data))
        matrix[i] = vector
        i += 1

    dm = DenseMatrix(matrix)
    sp = Space(dm, vocab, [])

    return sp
Example #34
0
    def test_build_data_row_col(self):
        """Space.build with data, row and column files, sparse and dense.

        Each case lists: data file stem, row file, column file, expected
        rows, expected columns, expected sparse matrix and expected dense
        matrix.  A ``None`` matrix means that building in that format is
        expected to raise ValueError.
        """
        test_cases = [
            ("data1", "row1.row", "col1.col", ["red"], ["man", "car"],
             np.mat([[5,3]]), np.mat([[3,5]])),
            ("data1", "row1.row", "col5.col", ["red"], ["man", "car"],
             np.mat([[5,3]]), np.mat([[3,5]])),
            ("data3", "row2.row", "col2.col", ["blue", "red"], ["car"],
             np.mat([[0],[15]]), None),
            ("data2", "row1.row", "col1.col", ["red"], ["man", "car"],
             np.mat([[0,3]]), None),
            ("data3", "row3.row", "col3.col", ["blue", "red"], ["man", "car"],
             np.mat([[6,0],[0,15]]), np.mat([[0,6],[5,0]])),
            ("data7", "row2.row", "col3.col", ["blue", "red"], ["man", "car"],
             np.mat([[0,0],[0,0]]), None),
            ("data3", "row2.row", "col4.col", ["blue", "red"], ["airplane"],
             np.mat([[0],[0]]), None),
        ]

        for stem, row_name, col_name, rows, cols, smat, dmat in test_cases:
            row_path = self.dir_ + row_name
            col_path = self.dir_ + col_name

            # Same checks for both storage formats, sparse first
            variants = (
                (".sparse", "sm", smat, SparseMatrix),
                (".dense", "dm", dmat, DenseMatrix),
            )
            for suffix, fmt, expected, matrix_cls in variants:
                data_path = self.dir_ + stem + suffix

                if expected is None:
                    self.assertRaises(ValueError, Space.build,
                                      data=data_path, rows=row_path,
                                      cols=col_path, format=fmt)
                    continue

                sp = Space.build(data=data_path, rows=row_path,
                                 cols=col_path, format=fmt)
                self.assertListEqual(rows, sp.id2row)
                self.assertListEqual(cols, sp.id2column)

                self.assertIsInstance(sp.cooccurrence_matrix, matrix_cls)
                actual = sp.cooccurrence_matrix.mat
                if fmt == "sm":
                    # the sparse matrix is densified before comparing
                    actual = actual.todense()
                np.testing.assert_array_equal(expected, actual)
def learn_TENSOR_matrix():
    """Learn one matrix per adjective, then a tensor over those matrices.

    Step 1: for every adjective returned by ``extract_adj`` a
    LexicalFunction is trained on the bigrams "<adj>_<noun>" of the bigram
    space, giving a matrix stored under the row name "ADJ_<adj>".
    Step 2: the per-adjective function spaces are stacked into one space.
    Step 3: a second LexicalFunction ("tens_adj") is trained that maps the
    unigram vector of an adjective to its "ADJ_<adj>" row, and is saved via
    ``save_space``.

    Relies on the module-level names ``args`` and ``unigram_space``.
    """
    bigram_space = load_space(args.function[2])
    my_comp_list = []
    id2row_list = []
    adj_list = extract_adj(bigram_space)

    for adj in adj_list:
        # eg ("ADJ_good", "boy", "good_boy"), where "ADJ_good" -> matrix to
        # learn, boy -> unigram, good_boy -> bigram
        train_data = []
        for bigram in bigram_space.id2row:
            pair = bigram.split('_')
            if not pair[0] == adj:
                continue
            train_data.append(("ADJ" + "_" + adj, pair[1], bigram))

        my_comp = LexicalFunction()  # 1)

        # Learn ADJ matrix for each adjective
        my_comp.train(train_data, unigram_space, bigram_space)
        my_comp_list.append(my_comp.function_space.cooccurrence_matrix)
        id2row_list.append(my_comp.function_space.id2row)

    # Start from the adjective learnt last ...
    my_mat_id2row = id2row_list.pop()
    my_mat_space = Space(my_comp_list.pop(), my_mat_id2row, [])

    # ... and create a new space using the ADJ matrices created, stacking
    # the remaining ones underneath in their original order
    for i in range(len(id2row_list)):
        my_mat_id2row.extend(id2row_list[i])
        my_mat_space = Space(my_mat_space.cooccurrence_matrix.vstack(my_comp_list[i]),
                             my_mat_id2row, [])

    # The rows are matrices: carry over their element shape.  Done after the
    # loop so that it is also set when there is only a single adjective
    # (the loop body is never entered in that case).
    my_mat_space._element_shape = my_comp.function_space.element_shape

    # Use the ADJ matrices space to learn the tensor matrix
    # eg ("tens_adj", good, ADJ_good), where "tens_adj" -> tensor matrix to
    # learn, good -> unigram, ADJ_good -> adjective matrix learnt by
    # 'my_comp' in 1)
    train_data = [('tens_adj', adj, "ADJ" + "_" + adj) for adj in adj_list]

    my_tens_adj = LexicalFunction()
    # unigram_space -> for "good", my_mat_space -> for "ADJ_good"
    my_tens_adj.train(train_data, unigram_space, my_mat_space)

    save_space(my_tens_adj, "TENSOR_matrix", "matrices")
def add_one_zero_vector(core_space):
    """Return ``core_space`` with two helper rows stacked underneath.

    The rows are an all-zero vector ("cg.zerovec") and an all-one vector
    ("cg.onevec"), both as wide as the co-occurrence matrix of
    ``core_space``.
    """
    width = core_space.cooccurrence_matrix.shape[1]
    helper_rows = np.vstack([np.zeros((1, width)), np.ones((1, width))])
    helper_space = Space(DenseMatrix(helper_rows),
                         ["cg.zerovec", "cg.onevec"],
                         [])
    return Space.vstack(core_space, helper_space)
Example #37
0
    def compose(self, data, arg_space):
        """
        Uses a composition model to compose elements.

        The data is processed in chunks to bound the memory overhead; the
        composed chunks are stacked into one matrix at the end.

        Args:
            data: data to be composed. List of tuples, each containing 3
            strings: (arg1, arg2, composed_phrase). arg1 and arg2 are the
            elements to be composed and composed_phrase is the string associated
            to their composition.

            arg_space: argument space(s). Space object or a tuple of two
            Space objects (e.g. my_space, or (my_space1, my_space2)).
            If two spaces are provided, arg1 elements of data are
            interpreted in space1, and arg2 in space2.

        Returns:
            composed space: a new object of type Space, containing the
            phrases obtained through composition.

        """
        start = time.time()

        arg1_space, arg2_space = self.extract_arg_spaces(arg_space)
        arg1_list, arg2_list, phrase_list = self.valid_data_to_lists(data,
                                                                     (arg1_space.row2id,
                                                                      arg2_space.row2id,
                                                                      None))

        # we try to achieve at most MAX_MEM_OVERHEAD*phrase_space memory overhead
        # the /3.0 is needed
        # because the composing data needs 3 * len(train_data) memory (arg1 vector, arg2 vector, phrase vector)
        chunk_size = int(max(arg1_space.cooccurrence_matrix.shape[0],arg2_space.cooccurrence_matrix.shape[0],len(phrase_list))
                          * self.MAX_MEM_OVERHEAD / 3.0) + 1

        composed_mats = []
        for i in range(int(math.ceil(len(arg1_list) / float(chunk_size)))):
            beg, end = i*chunk_size, min((i+1)*chunk_size, len(arg1_list))

            arg1_mat = arg1_space.get_rows(arg1_list[beg:end])
            arg2_mat = arg2_space.get_rows(arg2_list[beg:end])

            [arg1_mat, arg2_mat] = resolve_type_conflict([arg1_mat, arg2_mat],
                                                                    DenseMatrix)
            composed_mat = self._compose(arg1_mat, arg2_mat)
            composed_mats.append(composed_mat)

        composed_phrase_mat = composed_mat.nary_vstack(composed_mats)

        if self.composed_id2column is None:
            self.composed_id2column = self._build_id2column(arg1_space, arg2_space)

        log.print_name(logger, self, 1, "\nComposed with composition model:")
        # Report the size of the whole data set: arg1_mat only holds the rows
        # of the last chunk.
        log.print_info(logger, 3, "Composed total data points:%s" % len(arg1_list))
        log.print_matrix_info(logger, composed_phrase_mat, 4,
                              "Resulted (composed) semantic space::")
        log.print_time_info(logger, time.time(), start, 2)

        return Space(composed_phrase_mat, phrase_list, self.composed_id2column)
Example #38
0
 def test_init1(self):
     """Space built from matrix and id lists only: the lookup dicts are
     derived and the operations list starts out empty."""
     for case in self.init_test_cases:
         matrix, rows, columns, row_index, column_index, _ops = case
         built = Space(matrix, rows, columns)

         # the objects handed in are stored as they are
         self.assertIs(matrix, built.cooccurrence_matrix)
         self.assertIs(rows, built.id2row)
         self.assertIs(columns, built.id2column)

         # the reverse mappings equal the expected ones
         self.assertDictEqual(row_index, built.row2id)
         self.assertDictEqual(column_index, built.column2id)
         self.assertListEqual([], built.operations)
    def compose(self, data, arg_space):
        """
        Composes elements with a lexical function composition model.

        Args:
            data: list of (function_word, arg, composed_phrase) string
            tuples. function_word is looked up in self.function_space and
            arg in arg_space; composed_phrase names the row that results
            from composing the two.

            arg_space: argument space, of type Space.

        Returns:
            composed space: a new Space holding the composed phrases.

        """
        start = time.time()

        assert_is_instance(arg_space, Space)
        function_space = self._function_space
        valid_lists = self.valid_data_to_lists(
            data, (function_space.row2id, arg_space.row2id, None))
        arg1_list, arg2_list, phrase_list = valid_lists

        composed_vec_list = []
        for idx, function_word in enumerate(arg1_list):
            arg1_vec = function_space.get_row(function_word)
            arg2_vec = arg_space.get_row(arg2_list[idx])

            # bring both operands to the matrix type of the larger one
            matrix_type = get_type_of_largest([arg1_vec, arg2_vec])
            arg1_vec, arg2_vec = resolve_type_conflict([arg1_vec, arg2_vec],
                                                       matrix_type)

            composed_ph_vec = self._compose(arg1_vec, arg2_vec,
                                            function_space.element_shape)
            composed_vec_list.append(composed_ph_vec)

        # the composed elements have the function shape minus its last axis
        result_element_shape = function_space.element_shape[0:-1]
        composed_ph_mat = composed_ph_vec.nary_vstack(composed_vec_list)

        log.print_name(logger, self, 1, "\nComposed with composition model:")
        log.print_info(logger, 3,
                       "Composed total data points:%s" % len(arg1_list))
        shape_message = (
            "Functional shape of the resulted (composed) elements:%s" %
            (result_element_shape, ))
        log.print_info(logger, 3, shape_message)
        log.print_matrix_info(logger, composed_ph_mat, 4,
                              "Resulted (composed) semantic space:")
        log.print_time_info(logger, time.time(), start, 2)

        return Space(composed_ph_mat,
                     phrase_list,
                     self.composed_id2column,
                     element_shape=result_element_shape)
Example #40
0
    def setUp(self):
        """Create the matrices, label lists and spaces used by the tests."""
        # label lists
        self.row1 = ["a"]
        self.row2 = ["b"]
        self.row3 = ["a", "b", "c"]
        self.ft1 = ["f1", "f2", "f3"]

        # single-row matrices
        self.m1 = np.array([[1, 2, 3]])
        self.m2 = np.array([[4, 2, 6]])

        # 3x3 matrix with its two-column and one-column counterparts
        self.x = np.mat([[1, 2, 3], [2, 4, 6], [4, 675, 43]])
        self.us = np.mat([[2.19272110e+00, 3.03174768e+00],
                          [4.38544220e+00, 6.06349536e+00],
                          [6.76369708e+02, -4.91431927e-02]])
        self.us2 = np.mat([[2.19272110e+00],
                           [4.38544220e+00],
                           [6.76369708e+02]])

        # dense spaces over the matrices above
        self.space1 = Space(DenseMatrix(self.m1), self.row1, self.ft1)
        self.space2 = Space(DenseMatrix(self.x), self.row3, self.ft1)
Example #41
0
    def test_simple_sparse(self):
        """Run build_core_space on a sparse input and compare the exported
        sparse and pickled spaces with the input space."""
        workdir = self.dir_

        cli_args = ["build_core_space.py",
                    "-l", workdir + "log1.txt",
                    "-i", workdir + "mat1",
                    "-o", workdir,
                    "--input_format", "sm",
                    "--output_format", "sm"]
        bcs.main(cli_args)

        # the input space
        source_space = Space.build(data=workdir + "mat1.sm",
                                   cols=workdir + "mat1.cols",
                                   format="sm")
        # the two outputs written by the script
        exported_space = Space.build(data=workdir + "CORE_SS.mat1.sm",
                                     cols=workdir + "CORE_SS.mat1.cols",
                                     format="sm")
        pickled_space = io_utils.load(workdir + "CORE_SS.mat1.pkl", Space)

        self._test_equal_spaces_sparse(source_space, exported_space)
        self._test_equal_spaces_sparse(source_space, pickled_space)
Example #42
0
 def test_init4(self):
     """Space built with every argument given: each object handed in is
     stored as is (identity, not just equality)."""
     for case in self.init_test_cases:
         matrix, rows, columns, row_index, column_index, ops = case
         built = Space(matrix, rows, columns, row_index, column_index,
                       operations=ops)

         self.assertIs(matrix, built.cooccurrence_matrix)
         self.assertIs(rows, built.id2row)
         self.assertIs(columns, built.id2column)
         self.assertIs(row_index, built.row2id)
         self.assertIs(column_index, built.column2id)
         self.assertIs(ops, built.operations)
Example #43
0
    def test_simple_sparse(self):
        """Run build_peripheral_space on a sparse input and compare the
        exported sparse space with the input space."""
        workdir = self.dir_

        cli_args = [
            "build_peripheral_space.py",
            "-l", workdir + "log1.txt",
            "-i", workdir + "mat1",
            "-o", workdir,
            "-c", workdir + "CORE_SS.mat1.pkl",
            "--input_format", "sm",
            "--output_format", "sm",
        ]
        bps.main(cli_args)

        # the input space
        source_space = Space.build(data=workdir + "mat1.sm",
                                   cols=workdir + "mat1.cols",
                                   format="sm")
        # the space written by the script
        exported_prefix = workdir + "PER_SS.mat1.CORE_SS.mat1"
        exported_space = Space.build(data=exported_prefix + ".sm",
                                     cols=exported_prefix + ".cols",
                                     format="sm")

        self._test_equal_spaces_sparse(source_space, exported_space)
Example #44
0
    def to_dissect_core_space(self):
        """
        Builds a composes.semantic_space.space.Space from this object
        :rtype: composes.semantic_space.space.Space
        """
        from composes.matrix.sparse_matrix import SparseMatrix
        from composes.semantic_space.space import Space

        raw_matrix, cols, rows = self.to_sparse_matrix()
        space = Space(SparseMatrix(raw_matrix), rows, cols)

        # sanity check on (at most) the first ten rows: the vector stored in
        # the space must match the one obtained directly from this object
        sample_size = min(10, len(self))
        for idx in range(sample_size):
            row_name = rows[idx]
            from_space = space.get_row(row_name).mat
            from_self = self.v.transform(dict(self[row_name]))
            # sparse matrices do not currently support equality testing,
            # so check that their difference has no non-zero entries
            assert abs(from_space - from_self).nnz == 0

        return space
Example #45
0
 def fit(self, train_pairs, verbose=False):
     """Fit the weighted additive model on (base, derived) word pairs.

     Runs ``AdditiveModel.fit`` first, then adds ``self.diff_vector`` to the
     space as the row 'pattern_vector' and trains ``self.weighted_additive``
     with that row as the function word of every pair.

     Args:
         train_pairs: sequence of (base, derived) pairs.
         verbose: if true, print a progress message.
     """
     AdditiveModel.fit(self, train_pairs, verbose=verbose)
     if verbose:
         # Parenthesised form prints the same text on Python 2 and is valid
         # syntax on Python 3.
         print('fit: Fitting a weighted additive model on %d pairs' % (len(train_pairs)))
     # First, we embed the derived vector into the original space (by simply adding a row)
     vec_space = Space(self.diff_vector, ['pattern_vector'], [])
     self.new_space = Space.vstack(self.space, vec_space)
     # The class is designed to be run on a dataset with different function words (==patterns).
     # We use a dummy function word here.
     train_pairs_ext = [(base, 'pattern_vector', derived) for (base, derived) in train_pairs]
     self.weighted_additive.train(train_pairs_ext, self.new_space, self.new_space)
def build_unigram_space():
    """Build, transform and save the unigram space.

    The space is built in sparse format from the files named in
    ``args.function[1:4]`` (columns, rows, data), passed through the
    ``ppmi``, ``norm`` and ``svd`` helpers in that order, saved under the
    name "unigrams_space" and returned.
    """
    raw_space = Space.build(data=args.function[3],
                            rows=args.function[2],
                            cols=args.function[1],
                            format="sm")

    reduced_space = svd(norm(ppmi(raw_space)))

    save_space(reduced_space, "unigrams_space")
    return reduced_space
Example #47
0
    def write_pkl(self):
        """
        Create spaces from co-occurrence counts in sparse format (.sm) and
        save them as pickle files, one per translation direction.
        """

        # For direction DE-EN
        my_space_1 = Space.build(
            data=OUTPUT_FILE_DE_DE_EN_SM, rows=OUTPUT_FILE_DE_WORDS_ROW, cols=OUTPUT_FILE_DE_EN_WORDS_COL, format="sm"
        )

        # For direction EN-DE
        my_space_2 = Space.build(
            data=OUTPUT_FILE_EN_EN_DE_SM, rows=OUTPUT_FILE_EN_WORDS_ROW, cols=OUTPUT_FILE_DE_EN_WORDS_COL, format="sm"
        )

        # Save the space objects in pickle format
        io_utils.save(my_space_1, OUTPUT_FILE_DE_DE_EN_PKL)
        io_utils.save(my_space_2, OUTPUT_FILE_EN_EN_DE_PKL)

        # `print >> stderr` is Python 2-only syntax; writing to the stream
        # directly emits the same text on both major versions.
        stderr.write("Pickle file 1 written out: %s\n" % (OUTPUT_FILE_DE_DE_EN_PKL,))
        stderr.write("Pickle file 2 written out: %s\n" % (OUTPUT_FILE_EN_EN_DE_PKL,))
Example #48
0
    def test_simple_load(self):
        """Apply a previously trained lexical function model through the
        apply_composition script and compare the composed space with the
        AN_mat space."""

        #trained = io_utils.load(self.dir_ + "TRAINED_COMP_MODEL.lexical_func.an_train_data.txt.pkl")
        #new_space = trained.function_space

        workdir = self.dir_
        model_path = workdir + "TRAINED_COMP_MODEL.lexical_func.an_train_data.txt.pkl"

        cli_args = [
            "apply_composition.py",
            "-l", workdir + "log1.txt",
            "-i", workdir + "an_train_data.txt",
            "-o", workdir,
            "--load_model", model_path,
            "-a", workdir + "CORE_SS.N_mat.pkl",
            "--output_format", "dm",
        ]
        ac.main(cli_args)

        # space written by the script
        composed = Space.build(
            data=workdir + "COMPOSED_SS.LexicalFunction.an_train_data.txt.dm",
            format="dm")

        # space it is compared against
        reference = Space.build(data=workdir + "AN_mat.dm", format="dm")

        self._test_equal_spaces_dense(composed, reference)
Example #49
0
def inspect_representations(path_composed_emb, output_path):
    """Write the nearest neighbours of composed representations to a file.

    For (at most) the first 1000 words of the space built from
    ``path_composed_emb``, the 10 nearest neighbours under cosine
    similarity are written to ``output_path`` (UTF-8), one
    "<neighbour> <similarity>" pair per line.

    Args:
        path_composed_emb: path of the composed space in dense ("dm") format.
        output_path: path of the report file to write.
    """
    print('Inspecting representations...')
    composed_space = Space.build(data=path_composed_emb, format='dm')

    # Only the first 1000 words are inspected; slicing up front avoids
    # walking through the rest of the vocabulary for nothing.
    word_list = list(composed_space.get_row2id())[:1000]

    # The context manager closes the file even if a lookup fails half-way.
    with codecs.open(output_path, 'w', 'utf8') as f:
        for w in word_list:
            neighbours = composed_space.get_neighbours(w, 10, CosSimilarity())

            f.write('Neighbours for ' + w + '\n')
            f.write("\n".join('%s %.6f' % x for x in neighbours))
            f.write('\n----------------------------\n')
 def test_simple_nmf(self):
     """Run build_core_space with an nmf_2 reduction on a dense input and
     check the shape of the exported matrix."""
     workdir = self.dir_

     cli_args = [
         "build_core_space.py",
         "-l", workdir + "log_nmf.txt",
         "-i", workdir + "mat3",
         "-w", "raw",
         "-r", "nmf_2",
         "-o", workdir,
         "--input_format", "dm",
         "--output_format", "dm",
     ]
     bcs.main(cli_args)

     reduced = Space.build(data=workdir + "CORE_SS.mat3.raw.nmf_2.dm",
                           format="dm")
     self.assertEqual(reduced.cooccurrence_matrix.mat.shape, (3,2))
Example #51
0
def train_all_spaces(core_space, an_dn_space, pn_space, sv_space, vo_space):
    """Train the four function spaces and stack them into one space.

    ``core_space`` is row-normalised first.  One space each is trained with
    ``train_one_space`` for adjectives/determiners, prepositions, verbs
    with objects and verbs with subjects.  The rows of the two verb spaces
    get the suffixes ".objmat" and ".subjmat" so that they stay distinct
    after stacking.

    Returns:
        The Space obtained by stacking the four trained spaces.
    """
    core_space = core_space.apply(RowNormalization())
    # Parenthesised prints give the same output on Python 2 and are valid
    # syntax on Python 3.
    print("train adj, det")
    a_d_space = train_one_space(core_space, an_dn_space, 0, 3)
    print("train prep")
    prep_space = train_one_space(core_space, pn_space, 1, 3)
    print("train vo")
    v_obj_space = train_one_space(core_space, vo_space, 0, 4)
    print("train sv")
    v_subj_space = train_one_space(core_space, sv_space, 1, 4)

    # Relabel the verb rows; the reverse lookup is rebuilt to match
    new_v_obj_rows = [row + ".objmat" for row in v_obj_space.id2row]
    v_obj_space._id2row = new_v_obj_rows
    v_obj_space._row2id = list2dict(new_v_obj_rows)

    new_v_subj_rows = [row + ".subjmat" for row in v_subj_space.id2row]
    v_subj_space._id2row = new_v_subj_rows
    v_subj_space._row2id = list2dict(new_v_subj_rows)

    all_mat_space = Space.vstack(a_d_space, prep_space)
    all_mat_space = Space.vstack(v_obj_space, all_mat_space)
    all_mat_space = Space.vstack(v_subj_space, all_mat_space)
    return all_mat_space
Example #52
0
    def test_build_data_row(self):
        """Space.build with a data file and a row file, sparse and dense.

        Each case lists: data file stem, row file, expected rows, expected
        columns, expected sparse matrix and expected dense matrix.
        """
        test_cases = [
            ("data1", "row1.row", ["red"], ["car", "man"],
             np.mat([[3,5]]), np.mat([[3,5]])),
            ("data2", "row1.row", ["red"], ["car"],
             np.mat([[3]]), np.mat([[3]])),
            ("data3", "row2.row", ["blue", "red"], ["car", "man"],
             np.mat([[0,6],[15,0]]), np.mat([[0,6],[5,0]])),
            ("data3", "row3.row", ["blue", "red"], ["car", "man"],
             np.mat([[0,6],[15,0]]), np.mat([[0,6],[5,0]])),
            ("data7", "row2.row", ["blue", "red"], ["car"],
             np.mat([[0],[0]]), np.mat([[0],[0]])),
        ]

        for stem, row_name, rows, cols, smat, dmat in test_cases:
            row_path = self.dir_ + row_name
            data_prefix = self.dir_ + stem

            # sparse format: the column file is given as well
            sparse_space = Space.build(data=data_prefix + ".sparse",
                                       rows=row_path,
                                       cols=data_prefix + ".cols",
                                       format="sm")
            self.assertListEqual(rows, sparse_space.id2row)
            self.assertListEqual(cols, sparse_space.id2column)

            sparse_matrix = sparse_space.cooccurrence_matrix
            self.assertIsInstance(sparse_matrix, SparseMatrix)
            np.testing.assert_array_equal(smat, sparse_matrix.mat.todense())

            # dense format: no column file, so no column labels are expected
            dense_space = Space.build(data=data_prefix + ".dense",
                                      rows=row_path,
                                      format="dm")
            self.assertListEqual(rows, dense_space.id2row)
            self.assertListEqual([], dense_space.id2column)

            dense_matrix = dense_space.cooccurrence_matrix
            self.assertIsInstance(dense_matrix, DenseMatrix)
            np.testing.assert_array_equal(dmat, dense_matrix.mat)
def build_spaces(in_file_prefix, in_format, out_dir, out_format, weightings,
                 selections, reductions, normalizations, is_gz):
    """Build a core space and print every requested variant of it.

    The input space is loaded (``pkl``) or built (``sm`` / ``dm``) once.
    Then every combination of weighting, selection, reduction and
    normalization is applied, in that order, and each result is handed to
    ``print_space``.

    Args:
        in_file_prefix: path of the input files without extension.
        in_format: one of "sm", "dm" or "pkl".
        out_dir: output directory passed on to ``print_space``.
        out_format: output format passed on to ``print_space``.
        weightings, selections, reductions, normalizations: iterables of
            the operations to combine.
        is_gz: if true, ".gz" is appended to the data file name (ignored
            for the "pkl" format).

    Raises:
        ValueError: for an unknown input format, or when the format is "sm"
            and the column file does not exist.
    """
    in_file_descr = "CORE_SS." + in_file_prefix.split("/")[-1]
    data_file = '%s.%s' % (in_file_prefix, in_format)

    if in_format not in ("sm", "dm", "pkl"):
        raise ValueError("Invalid input format:%s" % in_format)

    if in_format == "pkl":
        space = io_utils.load(data_file, Space)
    else:
        if is_gz:
            data_file = '%s.gz' % data_file
        row_file = '%s.rows' % (in_file_prefix)
        column_file = '%s.cols' % (in_file_prefix)
        # Missing label files are passed to Space.build as None ...
        if not os.path.exists(row_file):
            row_file = None
        if not os.path.exists(column_file):
            # ... except that the sparse format cannot do without columns.
            if in_format == "sm":
                raise ValueError("Column file: %s needs to be provided!"
                                 % column_file)
            column_file = None

        # Parenthesised prints give the same output on Python 2 and are
        # valid syntax on Python 3.
        print("Building matrix...")
        space = Space.build(data=data_file, rows=row_file, cols=column_file,
                            format=in_format)

    # Each level reuses the result of the level above it
    for w in weightings:
        w_space = apply_weighting(space, w)

        for s in selections:
            s_space = apply_selection(w_space, s)

            for r in reductions:
                r_space = apply_reduction(s_space, r)

                for n in normalizations:
                    n_space = apply_normalization(r_space, n)

                    print("Printing...")
                    print_space(n_space, out_dir, [in_file_descr, w, s, r, n], out_format)
from composes.semantic_space.space import Space
from composes.utils import io_utils
from composes.transformation.scaling.ppmi_weighting import PpmiWeighting
from composes.transformation.scaling.row_normalization import RowNormalization
from composes.transformation.dim_reduction.svd import Svd;

import sys

# Build a space from co-occurrence counts in sparse format, then apply PPMI
# weighting, row normalization and an SVD reduction to 1500 dimensions.
# The data set name is taken from the first command line argument.
my_space = (
    Space.build(data="../data/%s.sm" % sys.argv[1],
                rows="../data/%s.rows" % sys.argv[1],
                cols="../data/%s.cols" % sys.argv[1],
                format="sm")
    .apply(PpmiWeighting())
    .apply(RowNormalization())
    .apply(Svd(1500))
)

# Export the reduced space in dense format and as a pickle file
my_space.export("../spaces/%s" % sys.argv[1], format="dm")
io_utils.save(my_space, "../spaces/%s.pkl" % sys.argv[1])
Example #55
0
from subprocess import Popen, PIPE
import os
import time


# Help text describing how the script is called
usage = """
Usage: python dissect.py dissect_format_file_name

dissect_format_file_name: path to a file containing dissect format
"""

# Location of the pattern extractor script
CMD_EXTRACTOR_SCRIPT = '~/Programming/terminology_extractor/extract_patterns.py'

# Common prefix of the .sm / .rows / .cols input files (first CLI argument)
file_name = sys.argv[1]

# Build the sparse space and apply PPMI weighting to it
my_space = Space.build(
    data="%s.sm" % file_name,
    rows="%s.rows" % file_name,
    cols="%s.cols" % file_name,
    format="sm",
).apply(PpmiWeighting())
# print my_space.get_sim("spain", "netherlands", CosSimilarity())
# print my_space.get_neighbours('parenchymopbouw', 4, CosSimilarity())
# print my_space.get_neighbours('pension-n', 4, CosSimilarity())
# print my_space.id2row


def prettify(elem):
    """
    Return a pretty-printed XML string for the Element.
    """
    rough_string = ElementTree.tostring(elem, 'utf-8')
    reparsed = minidom.parseString(rough_string)