def compute_neighbours(in_file, no_neighbours, out_dir, sim_measure, space_files):
    """
    Writes the neighbours of every word listed in in_file to a file.

    Args:
        in_file: path of the word list, read with io_utils.read_list
        no_neighbours: forwarded to Space.get_neighbours (presumably the
            number of neighbours to retrieve per word)
        out_dir: directory in which the output file is created
        sim_measure: string, one of cos/lin/dot_prod/euclidean
        space_files: list of one or two paths of serialized Space objects;
            when a second one is given it is loaded and forwarded to
            Space.get_neighbours as its last argument

    Prints:
        out_dir/NEIGHBOURS.<in_file name>.<space name(s)>.<sim_measure>,
        containing each word on its own line followed by one tab-indented
        "<neighbour> <similarity>" line per returned neighbour.

    Raises:
        ValueError: if sim_measure is not one of the known measures
    """
    sim_dict = {"cos": CosSimilarity(),
                "lin": LinSimilarity(),
                "dot_prod": DotProdSimilarity(),
                "euclidean": EuclideanSimilarity()}

    # Reject an unknown measure before any (potentially slow) space loading.
    if sim_measure not in sim_dict:
        raise ValueError("Similarity measure:%s not defined" % sim_measure)

    space = io_utils.load(space_files[0], Space)
    space2 = None
    # File name of the space without its directory and last extension.
    space_descr = ".".join(space_files[0].split("/")[-1].split(".")[0:-1])
    if len(space_files) == 2:
        space2 = io_utils.load(space_files[1], Space)
        space_descr = ".".join([space_descr] + space_files[1].split("/")[-1].split(".")[0:-1])

    sim = sim_dict[sim_measure]

    descr = ".".join(["NEIGHBOURS", in_file.split("/")[-1], space_descr])
    out_file = '%s/%s.%s' % (out_dir, descr, sim_measure)
    io_utils.create_parent_directories(out_file)

    data = io_utils.read_list(in_file)

    # Parenthesised single-argument form: valid on both Python 2 and 3.
    print("Computing neighbours: %s" % sim_measure)
    with open(out_file, "w") as out_stream:
        for word in data:
            out_stream.write("%s\n" % word)
            result = space.get_neighbours(word, no_neighbours, sim, space2)
            for neighbour, neighbour_sim in result:
                out_stream.write("\t%s %s\n" % (neighbour, neighbour_sim))
Esempio n. 2
0
def compute_neighbours(in_file, no_neighbours, out_dir, sim_measure,
                       space_files):
    """
    Dumps, for each word of a word list, its neighbours in a semantic space.

    Args:
        in_file: path of the word list (read through io_utils.read_list)
        no_neighbours: passed on unchanged to Space.get_neighbours
        out_dir: directory receiving the output file
        sim_measure: string key, one of cos/lin/dot_prod/euclidean
        space_files: one or two paths of serialized Space objects; the
            second space, if present, is passed on to get_neighbours

    Prints:
        out_dir/NEIGHBOURS.<in_file name>.<space name(s)>.<sim_measure>
        with a line per word and a tab-indented "<neighbour> <similarity>"
        line per neighbour.

    Raises:
        ValueError: if sim_measure is not a key of the measure table
    """
    measures = {
        "cos": CosSimilarity(),
        "lin": LinSimilarity(),
        "dot_prod": DotProdSimilarity(),
        "euclidean": EuclideanSimilarity()
    }
    if sim_measure not in measures:
        raise ValueError("Similarity measure:%s not defined" % sim_measure)

    # Main space; its description is the file name minus the last extension.
    main_path = space_files[0]
    space = io_utils.load(main_path, Space)
    main_name_parts = main_path.split("/")[-1].split(".")[0:-1]
    space_descr = ".".join(main_name_parts)

    # Optional second space: its name is appended to the description.
    second_space = None
    if len(space_files) == 2:
        other_path = space_files[1]
        second_space = io_utils.load(other_path, Space)
        other_name_parts = other_path.split("/")[-1].split(".")[0:-1]
        space_descr = ".".join([space_descr] + other_name_parts)

    similarity = measures[sim_measure]

    in_name = in_file.split("/")[-1]
    descr = ".".join(["NEIGHBOURS", in_name, space_descr])
    out_file = '%s/%s.%s' % (out_dir, descr, sim_measure)
    io_utils.create_parent_directories(out_file)

    words = io_utils.read_list(in_file)

    print("Computing neighbours: %s" % sim_measure)
    with open(out_file, "w") as out_stream:
        for word in words:
            # The word header is written before the lookup, as a section title.
            out_stream.write("%s\n" % word)
            neighbours = space.get_neighbours(word, no_neighbours, similarity,
                                              second_space)
            out_stream.writelines("\t%s %s\n" % (neighbour, neighbour_sim)
                                  for neighbour, neighbour_sim in neighbours)
Esempio n. 3
0
    def export(self, filename):
        """
        Writes the parameters of the composition model to a file.

        Args:
            filename: string, path of the output file

        create_parent_directories is called on filename first; the writing
        itself is delegated to the model-specific _export method, so the
        output format depends on the concrete model.
        """
        create_parent_directories(filename)
        self._export(filename)
Esempio n. 4
0
def compute_sim(in_file, columns, out_dir, sim_measures, space_files):
    """
    Appends predicted similarities to the word pairs listed in in_file.

    Args:
        in_file: path of a whitespace-separated file holding word pairs
        columns: two 1-based column numbers locating the two words
        out_dir: directory receiving one output file per measure
        sim_measures: iterable of measure names (cos/lin/dot_prod/euclidean);
            unknown names are warned about and skipped
        space_files: one or two paths of serialized Space objects; the
            second space, if present, is passed on to Space.get_sim

    Prints:
        out_dir/SIMS.<in_file name>.<space name(s)>.<sim_measure> for every
        known measure: each non-blank input line, stripped, followed by the
        predicted similarity.

    Raises:
        ValueError: if columns does not hold exactly two entries

    Returns early (after a warning) if loading the first space raises
    TypeError.
    """
    measures = {
        "cos": CosSimilarity(),
        "lin": LinSimilarity(),
        "dot_prod": DotProdSimilarity(),
        "euclidean": EuclideanSimilarity()
    }

    if len(columns) != 2:
        raise ValueError("Column description unrecognized!")
    # Column numbers are 1-based on input, 0-based for indexing.
    first_col = int(columns[0]) - 1
    second_col = int(columns[1]) - 1

    main_path = space_files[0]
    try:
        space = io_utils.load(main_path, Space)
    except TypeError:
        warn("Not a Space instance in file: %s" % space_files[0])
        return

    # Space description: file name without directory and last extension.
    space_descr = ".".join(main_path.split("/")[-1].split(".")[0:-1])

    second_space = None
    if len(space_files) == 2:
        other_path = space_files[1]
        second_space = io_utils.load(other_path, Space)
        other_name_parts = other_path.split("/")[-1].split(".")[0:-1]
        space_descr = ".".join([space_descr] + other_name_parts)

    descr = ".".join(["SIMS", in_file.split("/")[-1], space_descr])

    for measure_name in sim_measures:
        print("Computing similarities: %s" % measure_name)
        if measure_name not in measures:
            warn("Similarity measure:%s not defined" % measure_name)
            continue

        similarity = measures[measure_name]
        out_file = '%s/%s.%s' % (out_dir, descr, measure_name)
        io_utils.create_parent_directories(out_file)

        with open(in_file) as in_stream, open(out_file, "w") as out_stream:
            for raw_line in in_stream:
                stripped = raw_line.strip()
                if stripped == "":
                    # Blank lines are not copied to the output.
                    continue

                fields = stripped.split()
                predicted_sim = space.get_sim(fields[first_col],
                                              fields[second_col],
                                              similarity, second_space)
                out_stream.write("%s %s\n" % (stripped, str(predicted_sim)))
Esempio n. 5
0
def compute_sim(in_file, columns, out_dir, sim_measures, space_files):
    """
    Computes similarities for the word pairs listed in in_file.

    Args:
        in_file: path of a whitespace-separated file containing word pairs
        columns: sequence of two 1-based column numbers giving the position
            of the two words on each line
        out_dir: directory in which the output files are created
        sim_measures: iterable of measure names, each one of
            cos/lin/dot_prod/euclidean; undefined names produce a warning
            and are skipped
        space_files: list of one or two paths of serialized Space objects;
            when a second one is given it is loaded and forwarded to
            Space.get_sim as its last argument

    Prints:
        one file per known measure, named
        out_dir/SIMS.<in_file name>.<space name(s)>.<sim_measure>, in which
        every non-blank input line is repeated (stripped) with the predicted
        similarity appended.

    Raises:
        ValueError: if columns does not contain exactly two entries, or an
            entry cannot be converted to int

    If loading the first space raises TypeError, a warning is issued and the
    function returns without writing anything. Loading the second space is
    not guarded in the same way.
    """
    # Validate the cheap arguments first so that bad input fails before any
    # similarity object is built or any space is loaded.
    if len(columns) != 2:
        raise ValueError("Column description unrecognized!")
    col0 = int(columns[0]) - 1
    col1 = int(columns[1]) - 1

    sim_dict = {"cos": CosSimilarity(),
                "lin": LinSimilarity(),
                "dot_prod": DotProdSimilarity(),
                "euclidean": EuclideanSimilarity()}

    try:
        space = io_utils.load(space_files[0], Space)
    except TypeError:
        warn("Not a Space instance in file: %s" % space_files[0])
        return

    space2 = None
    # File name of the space without its directory and last extension.
    space_descr = ".".join(space_files[0].split("/")[-1].split(".")[0:-1])

    if len(space_files) == 2:
        space2 = io_utils.load(space_files[1], Space)
        space_descr = ".".join([space_descr] + space_files[1].split("/")[-1].split(".")[0:-1])

    descr = ".".join(["SIMS", in_file.split("/")[-1], space_descr])

    for sim_measure in sim_measures:
        # Parenthesised single-argument form: valid on both Python 2 and 3.
        print("Computing similarities: %s" % sim_measure)
        if sim_measure not in sim_dict:
            warn("Similarity measure:%s not defined" % sim_measure)
            continue

        sim = sim_dict[sim_measure]
        out_file = '%s/%s.%s' % (out_dir, descr, sim_measure)
        io_utils.create_parent_directories(out_file)

        with open(in_file) as in_stream, open(out_file, "w") as out_stream:
            for line in in_stream:
                stripped = line.strip()
                if not stripped:
                    continue

                elems = stripped.split()
                word1 = elems[col0]
                word2 = elems[col1]

                predicted_sim = space.get_sim(word1, word2, sim, space2)
                out_stream.write("%s %s\n" % (stripped, str(predicted_sim)))
Esempio n. 6
0
 def export(self, file_prefix, **kwargs):
     """
     Exports the current space to disk.
     If the space has no column information, it cannot be exported in
     sparse format (sm).

     Args:
         file_prefix: string, prefix of the files to be exported
         format: string, one of dm/sm; defaults to dm when omitted

     Prints:
         - matrix in file_prefix.<format>
         - row elements in file_prefix.<row>
         - col elements in file_prefix.<col>

     Raises:
         ValueError: if the format is neither dm nor sm; also (presumably
             from the sparse printing helper, not visible here) if the
             space has no column info and "sm" exporting is attempted
         NotImplementedError: the space matrix is dense and "sm" exporting
             is attempted (presumably raised by the sparse printing helper)

     """

     start = time.time()
     create_parent_directories(file_prefix)

     # The matrix is written for the default format as well: previously
     # the printing was nested under the `"format" in kwargs` test, so a
     # call without `format` silently produced no matrix file.
     format_ = kwargs.get("format", "dm")
     if format_ not in ("dm", "sm"):
         raise ValueError("Unrecognized format: %s" % format_)

     if format_ == "dm":
         print_cooc_mat_dense_format(self.cooccurrence_matrix,
                                     self.id2row, file_prefix)
     else:
         print_cooc_mat_sparse_format(self.cooccurrence_matrix,
                                      self.id2row,
                                      self.id2column, file_prefix)
     self._export_row_column(file_prefix)

     log.print_matrix_info(logger, self.cooccurrence_matrix, 1,
                           "Printed semantic space:")
     log.print_time_info(logger, time.time(), start, 2)
Esempio n. 7
0
    def export(self, file_prefix, **kwargs):
        """
        Exports the current space to disk.
        If the space has no column information, it cannot be exported in
        sparse format (sm).

        Args:
            file_prefix: string, prefix of the files to be exported
            format: string, one of dm/sm; defaults to dm when omitted

        Prints:
            - matrix in file_prefix.<format>
            - row elements in file_prefix.<row>
            - col elements in file_prefix.<col>

        Raises:
            ValueError: if the format is neither dm nor sm; also (presumably
                from the sparse printing helper, not visible here) if the
                space has no column info and "sm" exporting is attempted
            NotImplementedError: the space matrix is dense and "sm" exporting
                is attempted (presumably raised by the sparse printing helper)

        """

        start = time.time()
        create_parent_directories(file_prefix)

        # The matrix is written for the default format as well: previously
        # the printing was nested under the `"format" in kwargs` test, so a
        # call without `format` silently produced no matrix file.
        format_ = kwargs.get("format", "dm")
        if format_ not in ("dm", "sm"):
            raise ValueError("Unrecognized format: %s" % format_)

        if format_ == "dm":
            print_cooc_mat_dense_format(self.cooccurrence_matrix,
                                        self.id2row, file_prefix)
        else:
            print_cooc_mat_sparse_format(self.cooccurrence_matrix,
                                         self.id2row, self.id2column,
                                         file_prefix)
        self._export_row_column(file_prefix)

        log.print_matrix_info(logger, self.cooccurrence_matrix, 1,
                              "Printed semantic space:")
        log.print_time_info(logger, time.time(), start, 2)
Esempio n. 8
0
def config_logging(file_name, level=logging.INFO, format_=""):
    """
    Configures the root logger to write to a file.

    Args:
        file_name: path of the log file, or None to leave logging untouched
        level: logging level handed to logging.basicConfig
        format_: format string handed to logging.basicConfig

    When file_name is given, create_parent_directories is called on it,
    logging.basicConfig is invoked and a first "start logging" debug
    record is emitted.
    """
    if file_name is None:
        return

    create_parent_directories(file_name)
    logging.basicConfig(filename=file_name, level=level, format=format_)
    logging.debug("start logging")