Example #1
def _save_word2vec_format(fname,
                          vocab,
                          vectors,
                          fvocab=None,
                          binary=False,
                          total_vec=None):
    """Store the input-hidden weight matrix in the same format used by the original
        C word2vec-tool, for compatibility.

        Parameters
        ----------
        fname : str
            The file path used to save the vectors in
        vocab : dict
            The vocabulary of words
        vectors : numpy.array
            The vectors to be stored
        fvocab : str
            Optional file path used to save the vocabulary
        binary : bool
            If True, the data wil be saved in binary word2vec format, else it will be saved in plain text.
        total_vec :  int
            Optional parameter to explicitly specify total no. of vectors
            (in case word vectors are appended with document vectors afterwards)

        """
    if not (vocab or vectors):
        raise RuntimeError("no input vocabulary or vectors to store")
    if total_vec is None:
        total_vec = len(vocab)
    vector_size = vectors.shape[1]
    if fvocab is not None:
        logger.info("storing vocabulary in %s", fvocab)
        with utils.smart_open(fvocab, 'wb') as vout:
            for word, vocab_ in sorted(iteritems(vocab),
                                       key=lambda item: -item[1].count):
                vout.write(utils.to_utf8("%s %s\n" % (word, vocab_.count)))
    logger.info("storing %sx%s projection weights into %s", total_vec,
                vector_size, fname)
    assert (len(vocab), vector_size) == vectors.shape
    with utils.smart_open(fname, 'wb') as fout:
        fout.write(utils.to_utf8("%s %s\n" % (total_vec, vector_size)))
        # store in sorted order: most frequent words at the top
        for word, vocab_ in sorted(iteritems(vocab),
                                   key=lambda item: -item[1].count):
            row = vectors[vocab_.index]
            if binary:
                fout.write(utils.to_utf8(word) + b" " + row.tostring())
            else:
                fout.write(
                    utils.to_utf8("%s %s\n" %
                                  (word, ' '.join("%f" % val for val in row))))
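# Illustrative output of the loop above in plain-text mode (values are made up):
# the header line holds `total_vec` and `vector_size`, then one word per line,
# most frequent first, followed by its "%f"-formatted vector components.
#
#   3 4
#   the 0.123456 -0.654321 0.111111 0.222222
#   of 0.098765 0.432100 -0.333333 0.444444
#   and ...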
    def save_as_text(self, fname, sort_by_word=True):
        """Save :class:`~gensim.corpora.dictionary.Dictionary` to a text file.

        Parameters
        ----------
        fname : str
            Path to output file.
        sort_by_word : bool, optional
            If True, sort by word in lexicographical order.

        Notes
        -----
        Format::

            num_docs
            id_1[TAB]word_1[TAB]document_frequency_1[NEWLINE]
            id_2[TAB]word_2[TAB]document_frequency_2[NEWLINE]
            ....
            id_k[TAB]word_k[TAB]document_frequency_k[NEWLINE]

        Warnings
        --------
        Text format should be used only for corpus inspection. Use :meth:`~gensim.corpora.dictionary.Dictionary.save` and
        :meth:`~gensim.corpora.dictionary.Dictionary.load` to store in binary format (pickle) for better performance.

        See Also
        --------
        :meth:`~gensim.corpora.dictionary.Dictionary.load_from_text`

        Examples
        --------
        >>> from gensim.corpora import Dictionary
        >>> from gensim.test.utils import get_tmpfile
        >>>
        >>> tmp_fname = get_tmpfile("dictionary")
        >>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
        >>>
        >>> dct = Dictionary(corpus)
        >>> dct.save_as_text(tmp_fname)
        >>>
        >>> loaded_dct = Dictionary.load_from_text(tmp_fname)
        >>> assert dct.token2id == loaded_dct.token2id

        """
        logger.info("saving dictionary mapping to %s", fname)
        with utils.smart_open(fname, 'wb') as fout:
            numdocs_line = "%d\n" % self.num_docs
            fout.write(utils.to_utf8(numdocs_line))
            if sort_by_word:
                for token, tokenid in sorted(iteritems(self.token2id)):
                    line = "%i\t%s\t%i\n" % (tokenid, token,
                                             self.dfs.get(tokenid, 0))
                    fout.write(utils.to_utf8(line))
            else:
                for tokenid, freq in sorted(iteritems(self.dfs),
                                            key=lambda item: -item[1]):
                    line = "%i\t%s\t%i\n" % (tokenid, self[tokenid], freq)
                    fout.write(utils.to_utf8(line))
    def accuracy(self, questions, case_insensitive=False, predictor_method=0):
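        """Evaluate the ensemble on a word2vec-style analogy question file
        (the expected file layout is sketched after this method).

        `predictor_method` selects the ensemble strategy: 0 = majority vote,
        1 = summed probabilities, 2 = weighted sum of probabilities.
        Returns the per-section lists of correct and incorrect analogies.
        """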
        correct = 0
        incorrect = 0
        sections, section = [], None
        for line_no, line in enumerate(utils.smart_open(questions)):
            # TODO: use level3 BLAS (=evaluate multiple questions at once), for speed
            line = utils.to_unicode(line)
            if line.startswith(': '):
                # a new section starts => store the old section
                if section:
                    sections.append(section)
                section = {'section': line.lstrip(': ').strip(), 'correct': [], 'incorrect': []}
            else:
                if not section:
                    raise ValueError("missing section header before line #%i in %s" % (line_no, questions))
                try:
                    if case_insensitive:
                        a, b, c, expected = [word.upper() for word in line.split()]
                    else:
                        a, b, c, expected = [word for word in line.split()]
                except ValueError:
                    logger.info("skipping invalid line #%i in %s", line_no, questions)
                    continue
                predicted = [None, None]
                if predictor_method == 0:
                    # evaluation method: majority vote
                    predicted = simple_ensamble.predict_majority_vote(self, positive_word_list=[b, c],
                                                                      negative_word_list=[a],
                                                                      top_n_words=1)
                elif predictor_method == 1:
                    # evaluation method: summed probabilities
                    predicted = simple_ensamble.predict_sum_proberbility(self, positive_word_list=[b, c],
                                                                         negative_word_list=[a],
                                                                         top_n_words=1)
                elif predictor_method == 2:
                    # evaluation method: weighted sum of probabilities
                    predicted = simple_ensamble.predict_weighted_sum_proberbility(self, positive_word_list=[b, c],
                                                                                  negative_word_list=[a],
                                                                                  top_n_words=1)
                else:
                    raise ValueError("unsupported value for predictor_method: %s" % predictor_method)

                if predicted[0] == expected:
                    correct += 1
                    section['correct'].append((a, b, c, expected))
                else:
                    incorrect += 1
                    section['incorrect'].append((a, b, c, expected))
        if section:
            # store the last section, too
            sections.append(section)

        print("correct: %d" % correct)
        print("incorrect: %d" % incorrect)
        return sections
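    # Illustrative `questions` file layout (the word2vec analogy format parsed above):
    # a line starting with ': ' opens a new section, every other line holds four
    # whitespace-separated words `a b c expected`, e.g.:
    #
    #   : capital-common-countries
    #   Athens Greece Oslo Norway
    #   Berlin Germany Paris France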
    @staticmethod
    def load_from_text(fname):
        """Load a previously stored :class:`~gensim.corpora.dictionary.Dictionary` from a text file.
        Mirror function to :meth:`~gensim.corpora.dictionary.Dictionary.save_as_text`.

        Parameters
        ----------
        fname: str
            Path to file produced by :meth:`~gensim.corpora.dictionary.Dictionary.save_as_text`.

        See Also
        --------
        :meth:`~gensim.corpora.dictionary.Dictionary.save_as_text`

        Examples
        --------
        >>> from gensim.corpora import Dictionary
        >>> from gensim.test.utils import get_tmpfile
        >>>
        >>> tmp_fname = get_tmpfile("dictionary")
        >>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
        >>>
        >>> dct = Dictionary(corpus)
        >>> dct.save_as_text(tmp_fname)
        >>>
        >>> loaded_dct = Dictionary.load_from_text(tmp_fname)
        >>> assert dct.token2id == loaded_dct.token2id

        """
        result = Dictionary()
        with utils.smart_open(fname) as f:
            for lineno, line in enumerate(f):
                line = utils.to_unicode(line)
                if lineno == 0:
                    if line.strip().isdigit():
                        # Older versions of save_as_text may not write num_docs on first line.
                        result.num_docs = int(line.strip())
                        continue
                    else:
                        logger.warning(
                            "Text does not contain num_docs on the first line."
                        )
                try:
                    wordid, word, docfreq = line[:-1].split('\t')
                except Exception:
                    raise ValueError("invalid line in dictionary file %s: %s" %
                                     (fname, line.strip()))
                wordid = int(wordid)
                if word in result.token2id:
                    raise KeyError(
                        'token %s is defined as ID %d and as ID %d' %
                        (word, wordid, result.token2id[word]))
                result.token2id[word] = wordid
                result.dfs[wordid] = int(docfreq)
        return result
    def __init__(self, fname):
        """

        Parameters
        ----------
        fname : str
            Path to output file

        """
        self.fname = fname
        if fname.endswith(".gz") or fname.endswith('.bz2'):
            raise NotImplementedError(
                "compressed output not supported with MmWriter")
        self.fout = utils.smart_open(
            self.fname, 'wb+')  # open for both reading and writing
        self.headers_written = False
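    # Rationale (inferred from the 'wb+' mode above): Matrix Market headers contain
    # totals that are typically only known after all entries have been written, so
    # keeping the stream open for both reading and writing lets the writer seek back
    # and fill in the real header afterwards.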
    def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000,
                            case_insensitive=False, dummy4unknown=False, similarity_model_type=0):
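        """Correlate ensemble similarities with human judgments read from a
        delimiter-separated `pairs` file (layout sketched after this method).

        Returns the (Pearson, Spearman) correlation results between the gold
        scores and the ensemble model's scores.
        """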

        similarity_gold = []
        similarity_model = []
        for line_no, line in enumerate(utils.smart_open(pairs)):
            line = utils.to_unicode(line)
            if line.startswith('#'):
                # comment line; skip it
                continue
            else:
                try:
                    if case_insensitive:
                        a, b, sim = [word.lower() for word in line.split(delimiter)]
                    else:
                        a, b, sim = [word for word in line.split(delimiter)]
                    sim = float(sim)
                except (ValueError, TypeError):
                    logger.info('skipping invalid line #%d in %s', line_no, pairs)
                    continue

                similarity_gold.append(sim)  # similarity from the dataset
                if similarity_model_type == 0:
                    # similarity from the model: plain average over the ensemble
                    similarity_model.append(simple_ensamble.similarity_avg_proberbility(self, a, b))
                elif similarity_model_type == 1:
                    if not self.weight_list:
                        raise ValueError("no weights specified for ensemble model")
                    similarity_model.append(simple_ensamble.similarity_weighted_avg_proberbility(self, a, b))
                else:
                    raise ValueError("unsupported value for similarity_model_type: %s" % similarity_model_type)

        spearman = stats.spearmanr(similarity_gold, similarity_model)
        pearson = stats.pearsonr(similarity_gold, similarity_model)

        print("Pearson correlation: %s" % (pearson,))
        print("Spearman rank correlation: %s" % (spearman,))
        return pearson, spearman
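    # Illustrative `pairs` file layout (tab-separated; '#' lines are comments;
    # the scores below are made up):
    #
    #   # word1<TAB>word2<TAB>human_similarity
    #   tiger<TAB>cat<TAB>7.35
    #   book<TAB>paper<TAB>6.80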
    def docbyoffset(self, offset):
        """Get document at file offset `offset` (in bytes)

        Parameters
        ----------
        offset : int
            Offset (in bytes).

        Returns
        -------
        list of (int, number)
            Document in BoW format, reached by `offset`.

        """
        # empty documents are not stored explicitly in MM format, so the index marks
        # them with a special offset, -1.
        if offset == -1:
            return []
        if isinstance(self.input, string_types):
            fin, close_fin = utils.smart_open(self.input), True
        else:
            fin, close_fin = self.input, False

        fin.seek(offset)  # works for gzip/bz2 input, too
        previd, document = -1, []
        for line in fin:
            docid, termid, val = line.split()
            if not self.transposed:
                termid, docid = docid, termid
            # -1 because matrix market indexes are 1-based => convert to 0-based
            docid, termid, val = int(docid) - 1, int(termid) - 1, float(val)
            assert previd <= docid, "matrix columns must come in ascending order"
            if docid != previd:
                if previd >= 0:
                    break
                previd = docid

            document.append((
                termid,
                val,
            ))  # add another field to the current document

        if close_fin:
            fin.close()
        return document
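    # Worked example of the 1-based -> 0-based conversion above: a Matrix Market
    # entry line b"3 7 1.0" (document 3, term 7, value 1.0) becomes docid=2,
    # termid=6, val=1.0, so document 2's BoW list gains the pair (6, 1.0).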
    def get_human_similarities_results(self, test_set):
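        """Read the human (gold-standard) similarity scores from a tab-separated
        test-set file under Code/TestingSet/ and return them as a list of floats."""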
        dir_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
        test_set = dir_path + '/Code/TestingSet/' + test_set
        results = []
        for line_no, line in enumerate(utils.smart_open(test_set)):
            line = utils.to_unicode(line)
            if line.startswith('#'):
                # comment line; skip it
                continue
            else:
                try:
                    a, b, sim = [word.upper() for word in line.split('\t')]
                    sim = float(sim)
                except (ValueError, TypeError):
                    logger.info('skipping invalid line #%d in %s', line_no, test_set)
                    continue

                results.append(sim)  # Similarity from the dataset
        return results
    def get_expected_acc_results(self, questions):
        """Return a list of the expected answers from an accuracy-test question file."""
        dir_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
        questions = dir_path + '/Code/TestingSet/' + questions
        results = []
        for line_no, line in enumerate(utils.smart_open(questions)):
            # TODO: use level3 BLAS (=evaluate multiple questions at once), for speed
            line = utils.to_unicode(line)
            if line.startswith(': '):
                continue
            else:
                try:
                    a, b, c, expected = [word.upper() for word in line.split()]
                except ValueError:
                    logger.info("skipping invalid line #%i in %s", line_no, questions)
                    continue

                results.append(expected)
        return results
Example #10
def _load_word2vec_format(cls,
                          fname,
                          fvocab=None,
                          binary=False,
                          encoding='utf8',
                          unicode_errors='strict',
                          limit=None,
                          datatype=REAL):
    """Load the input-hidden weight matrix from the original C word2vec-tool format.

    Note that the information stored in the file is incomplete (the binary tree is missing),
    so while you can query for word similarity etc., you cannot continue training
    with a model loaded this way.

    Parameters
    ----------
    fname : str
        The file path to the saved word2vec-format file.
    fvocab : str, optional
        Optional file path to the vocabulary. Word counts are read from `fvocab`,
        if set (this is the file generated by the `-save-vocab` flag of the original C tool).
    binary : bool
        If True, the data is in binary word2vec format.
    encoding : str
        If you trained the C model using non-utf8 encoding for words, specify that
        encoding in `encoding`.
    unicode_errors : str
        Default 'strict'; a string suitable to be passed as the `errors`
        argument to the unicode() (Python 2.x) or str() (Python 3.x) function. If your source
        file may include word tokens truncated in the middle of a multibyte unicode character
        (as is common from the original word2vec.c tool), 'ignore' or 'replace' may help.
    limit : int
        Sets a maximum number of word-vectors to read from the file. The default,
        None, means read all.
    datatype : :class:`numpy.float*`
        (Experimental) Can coerce dimensions to a non-default float type (such
        as np.float16) to save memory. (Such types may result in much slower bulk operations
        or incompatibility with optimized routines.)

    Returns
    -------
    :obj:`cls`
        The loaded model, as an instance of :class:`cls`.

    """
    from gensim.models.keyedvectors import Vocab
    counts = None
    if fvocab is not None:
        logger.info("loading word counts from %s", fvocab)
        counts = {}
        with utils.smart_open(fvocab) as fin:
            for line in fin:
                word, count = utils.to_unicode(line).strip().split()
                counts[word] = int(count)

    logger.info("loading projection weights from %s", fname)
    with utils.smart_open(fname) as fin:
        header = utils.to_unicode(fin.readline(), encoding=encoding)
        vocab_size, vector_size = (int(x) for x in header.split())  # throws for invalid file format
        if limit:
            vocab_size = min(vocab_size, limit)
        result = cls(vector_size)
        result.vector_size = vector_size
        result.vectors = zeros((vocab_size, vector_size), dtype=datatype)

        def add_word(word, weights):
            word_id = len(result.vocab)
            if word in result.vocab:
                logger.warning(
                    "duplicate word '%s' in %s, ignoring all but first", word,
                    fname)
                return
            if counts is None:
                # most common scenario: no vocab file given. just make up some bogus counts, in descending order
                result.vocab[word] = Vocab(index=word_id,
                                           count=vocab_size - word_id)
            elif word in counts:
                # use count from the vocab file
                result.vocab[word] = Vocab(index=word_id, count=counts[word])
            else:
                # vocab file given, but word is missing -- set count to None (TODO: or raise?)
                logger.warning(
                    "vocabulary file is incomplete: '%s' is missing", word)
                result.vocab[word] = Vocab(index=word_id, count=None)
            result.vectors[word_id] = weights
            result.index2word.append(word)

        if binary:
            binary_len = dtype(REAL).itemsize * vector_size
            for _ in xrange(vocab_size):
                # mixed text and binary: read text first, then binary
                word = []
                while True:
                    ch = fin.read(1)
                    if ch == b' ':
                        break
                    if ch == b'':
                        raise EOFError(
                            "unexpected end of input; is count incorrect or file otherwise damaged?"
                        )
                    if ch != b'\n':  # ignore newlines in front of words (some binary files have them)
                        word.append(ch)
                word = utils.to_unicode(b''.join(word),
                                        encoding=encoding,
                                        errors=unicode_errors)
                weights = fromstring(fin.read(binary_len), dtype=REAL)
                add_word(word, weights)
        else:
            for line_no in xrange(vocab_size):
                line = fin.readline()
                if line == b'':
                    raise EOFError(
                        "unexpected end of input; is count incorrect or file otherwise damaged?"
                    )
                parts = utils.to_unicode(line.rstrip(),
                                         encoding=encoding,
                                         errors=unicode_errors).split(" ")
                if len(parts) != vector_size + 1:
                    raise ValueError(
                        "invalid vector on line %s (is this really the text format?)"
                        % line_no)
                word, weights = parts[0], [REAL(x) for x in parts[1:]]
                add_word(word, weights)
    if result.vectors.shape[0] != len(result.vocab):
        logger.info(
            "duplicate words detected, shrinking matrix size from %i to %i",
            result.vectors.shape[0], len(result.vocab))
        result.vectors = ascontiguousarray(result.vectors[:len(result.vocab)])
    assert (len(result.vocab), vector_size) == result.vectors.shape

    logger.info("loaded %s matrix from %s", result.vectors.shape, fname)
    return result
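# Minimal usage sketch (hypothetical: assumes a KeyedVectors-like class is passed
# as `cls`; the file name is a placeholder):
#
#   from gensim.models import KeyedVectors
#   kv = _load_word2vec_format(KeyedVectors, 'vectors.bin', binary=True, limit=500000)
#   print(kv.vector_size, len(kv.vocab))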
    def load_binary_data(self, encoding='utf8'):
        """Load data from the output binary file created by FastText training."""
        with utils.smart_open(self.file_name, 'rb') as f:
            self._load_model_params(f)
            self._load_dict(f, encoding=encoding)
            self._load_vectors(f)