def __iter__(self):
        """Iterate over all corpus.

        Yields
        ------
        (prev_id, document) : (int, list of (int, number))
            Document id and document in BoW format.

        Notes
        -----
        Total number of vectors returned is always equal to the
        number of rows specified in the header; empty documents are inserted and
        yielded where appropriate, even if they are not explicitly stored in the
        Matrix Market file.

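        Examples
        --------
        A minimal sketch, assuming ``reader`` is an instance of this class built from a
        Matrix Market file (the name ``reader`` is illustrative, not taken from this file):

        >>> for doc_id, doc in reader:  # doctest: +SKIP
        ...     pass  # `doc` is a list of (term_id, value) 2-tuples
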
        """
        with utils.file_or_filename(self.input) as lines:
            self.skip_headers(lines)

            previd = -1
            for line in lines:
                docid, termid, val = utils.to_unicode(line).split()  # needed for python3
                if not self.transposed:
                    termid, docid = docid, termid
                # -1 because matrix market indexes are 1-based => convert to 0-based
                docid, termid, val = int(docid) - 1, int(termid) - 1, float(val)
                assert previd <= docid, "matrix columns must come in ascending order"
                if docid != previd:
                    # change of document: return the document read so far (its id is prevId)
                    if previd >= 0:
                        yield previd, document  # noqa:F821

                    # return implicit (empty) documents between previous id and new id
                    # too, to keep consistent document numbering and corpus length
                    for previd in xrange(previd + 1, docid):
                        yield previd, []

                    # from now on start adding fields to a new document, with a new id
                    previd = docid
                    document = []

                document.append((termid, val))  # add another field to the current document

        # handle the last document, as a special case
        if previd >= 0:
            yield previd, document

        # return empty documents between the last explicit document and the number
        # of documents as specified in the header
        for previd in xrange(previd + 1, self.num_docs):
            yield previd, []
    def accuracy(self, questions, case_insensitive=False, predictor_method=0):
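        """Evaluate the ensemble on a file of analogy questions.

        The file uses the word2vec `questions-words.txt` layout: section headers start
        with ': ', and every other line holds four whitespace-separated words
        ``a b c expected``.

        Parameters
        ----------
        questions : str
            Path to the analogy question file.
        case_insensitive : bool, optional
            If True, uppercase all words before evaluation.
        predictor_method : int, optional
            Which `simple_ensamble` predictor to dispatch to:
            0 = majority vote, 1 = summed probabilities, 2 = weighted sum of probabilities.

        Returns
        -------
        list of dict
            One dict per section, with keys 'section', 'correct' and 'incorrect'.

        """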
        correct = 0
        incorrect = 0
        sections, section = [], None
        for line_no, line in enumerate(utils.smart_open(questions)):
            # TODO: use level3 BLAS (=evaluate multiple questions at once), for speed
            line = utils.to_unicode(line)
            if line.startswith(': '):
                # a new section starts => store the old section
                if section:
                    sections.append(section)
                section = {'section': line.lstrip(': ').strip(), 'correct': [], 'incorrect': []}
            else:
                if not section:
                    raise ValueError("missing section header before line #%i in %s" % (line_no, questions))
                try:
                    if case_insensitive:
                        a, b, c, expected = [word.upper() for word in line.split()]
                    else:
                        a, b, c, expected = line.split()
                except ValueError:
                    # skip lines that do not contain exactly four whitespace-separated words
                    continue
                predicted = [None, None]
                if predictor_method == 0:
                    # evaluation method: majority vote
                    predicted = simple_ensamble.predict_majority_vote(
                        self, positive_word_list=[b, c], negative_word_list=[a], top_n_words=1)
                elif predictor_method == 1:
                    # evaluation method: summed probabilities
                    predicted = simple_ensamble.predict_sum_proberbility(
                        self, positive_word_list=[b, c], negative_word_list=[a], top_n_words=1)
                elif predictor_method == 2:
                    # evaluation method: weighted sum of probabilities
                    predicted = simple_ensamble.predict_weighted_sum_proberbility(
                        self, positive_word_list=[b, c], negative_word_list=[a], top_n_words=1)
                else:
                    raise ValueError("unknown predictor_method: %r" % predictor_method)

                if predicted[0] == expected:
                    correct += 1
                    section['correct'].append((a, b, c, expected))
                else:
                    incorrect += 1
                    section['incorrect'].append((a, b, c, expected))
        if section:
            # store the last section, too
            sections.append(section)

        print(correct)
        print(incorrect)
        #sections.append(total)
        return sections
    def load_from_text(fname):
        """Load a previously stored :class:`~gensim.corpora.dictionary.Dictionary` from a text file.
        Mirror function to :meth:`~gensim.corpora.dictionary.Dictionary.save_as_text`.

        Parameters
        ----------
        fname : str
            Path to file produced by :meth:`~gensim.corpora.dictionary.Dictionary.save_as_text`.

        See Also
        --------
        :meth:`~gensim.corpora.dictionary.Dictionary.save_as_text`

        Examples
        --------
        >>> from gensim.corpora import Dictionary
        >>> from gensim.test.utils import get_tmpfile
        >>>
        >>> tmp_fname = get_tmpfile("dictionary")
        >>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
        >>>
        >>> dct = Dictionary(corpus)
        >>> dct.save_as_text(tmp_fname)
        >>>
        >>> loaded_dct = Dictionary.load_from_text("testdata")
        >>> assert dct.token2id == loaded_dct.token2id

        """
        result = Dictionary()
        with utils.smart_open(fname) as f:
            for lineno, line in enumerate(f):
                line = utils.to_unicode(line)
                if lineno == 0:
                    if line.strip().isdigit():
                        # Older versions of save_as_text may not write num_docs on first line.
                        result.num_docs = int(line.strip())
                        continue
                    else:
                        logging.warning(
                            "Text does not contain num_docs on the first line."
                        )
                try:
                    wordid, word, docfreq = line[:-1].split('\t')
                except Exception:
                    raise ValueError("invalid line in dictionary file %s: %s" %
                                     (fname, line.strip()))
                wordid = int(wordid)
                if word in result.token2id:
                    raise KeyError(
                        'token %s is defined as ID %d and as ID %d' %
                        (word, wordid, result.token2id[word]))
                result.token2id[word] = wordid
                result.dfs[wordid] = int(docfreq)
        return result
    def __init__(self, input, transposed=True):
        """

        Parameters
        ----------
        input : {str, file-like object}
            Path to input file or file-like object (in Matrix Market format).
        transposed : bool, optional
            Orientation of the matrix. If True (default), documents are rows of the matrix;
            set to False if documents are stored as columns.

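        Examples
        --------
        A minimal sketch, assuming this reader is importable as ``MmReader`` (the import path
        is an assumption) and that a Matrix Market file exists at the illustrative path:

        >>> from gensim.matutils import MmReader  # doctest: +SKIP
        >>> reader = MmReader('/tmp/corpus.mm')  # doctest: +SKIP
        >>> reader.num_docs, reader.num_terms, reader.num_nnz  # doctest: +SKIP
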
        """
        logger.info("initializing corpus reader from %s", input)
        self.input, self.transposed = input, transposed
        with utils.file_or_filename(self.input) as lines:
            try:
                header = utils.to_unicode(next(lines)).strip()
                if not header.lower().startswith(
                        '%%matrixmarket matrix coordinate real general'):
                    raise ValueError(
                        "File %s not in Matrix Market format with coordinate real general; instead found: \n%s"
                        % (self.input, header))
            except StopIteration:
                pass

            self.num_docs = self.num_terms = self.num_nnz = 0
            for lineno, line in enumerate(lines):
                line = utils.to_unicode(line)
                if not line.startswith('%'):
                    self.num_docs, self.num_terms, self.num_nnz = (
                        int(x) for x in line.split())
                    if not self.transposed:
                        self.num_docs, self.num_terms = self.num_terms, self.num_docs
                    break

        logger.info(
            "accepted corpus with %i documents, %i features, %i non-zero entries",
            self.num_docs, self.num_terms, self.num_nnz)
    def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000,
                            case_insensitive=False, dummy4unknown=False, similarity_model_type=0):

        similarity_gold = []
        similarity_model = []
        for line_no, line in enumerate(utils.smart_open(pairs)):
            line = utils.to_unicode(line)
            if line.startswith('#'):
                # skip comment lines
                continue
            else:
                try:
                    if case_insensitive:
                        a, b, sim = [word.lower() for word in line.split(delimiter)]
                    else:
                        a, b, sim = line.split(delimiter)
                    sim = float(sim)
                except (ValueError, TypeError):
                    # logger.info('skipping invalid line #%d in %s', line_no, pairs)
                    continue

                similarity_gold.append(sim)  # Similarity from the dataset
                if similarity_model_type == 0:
                    similarity_model.append(
                        simple_ensamble.similarity_avg_proberbility(self, a, b))  # Similarity from the model
                elif similarity_model_type == 1:
                    if not self.weight_list:
                        raise ValueError("No weights specified for ensemble model")
                    similarity_model.append(simple_ensamble.similarity_weighted_avg_proberbility(self, a, b))
                else:
                    raise ValueError("unknown similarity_model_type: %r" % similarity_model_type)

        spearman = stats.spearmanr(similarity_gold, similarity_model)
        pearson = stats.pearsonr(similarity_gold, similarity_model)

        # logger.debug('Pearson correlation coefficient against %s: %f with p-value %f', pairs, pearson[0], pearson[1])
        # logger.debug(
        #    'Spearman rank-order correlation coefficient against %s: %f with p-value %f',
        #    pairs, spearman[0], spearman[1]
        # )
        # logger.debug('Pairs with unknown words: %d', oov)
        # self.log_evaluate_word_pairs(pearson, spearman, oov_ratio, pairs)
        print(pearson)
        print(spearman)
        return pearson, spearman
    def get_human_similarities_results(self, test_set):
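        """Read the gold-standard (human) similarity scores from a tab-separated test set.

        The file is looked up under ``Code/TestingSet/`` relative to this module; lines
        starting with '#' and lines that do not parse are skipped.

        Returns
        -------
        list of float
            The human similarity score of each valid pair, in file order.

        """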
        dir_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
        test_set = dir_path + '/Code/TestingSet/' + test_set
        results = []
        for line_no, line in enumerate(utils.smart_open(test_set)):
            line = utils.to_unicode(line)
            if line.startswith('#'):
                # skip comment lines
                continue
            else:
                try:
                    a, b, sim = [word.upper() for word in line.split('\t')]
                    sim = float(sim)
                except (ValueError, TypeError):
                    logger.info('skipping invalid line #%d in %s', line_no, test_set)
                    continue

                results.append(sim)  # Similarity from the dataset
        return results
    def get_expected_acc_results(self, questions):
        """Return a list of the expected (gold-standard) answers from an accuracy test set."""
        dir_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
        questions = dir_path + '/Code/TestingSet/' + questions
        results = []
        for line_no, line in enumerate(utils.smart_open(questions)):
            # TODO: use level3 BLAS (=evaluate multiple questions at once), for speed
            line = utils.to_unicode(line)
            if line.startswith(': '):
                continue
            else:
                try:
                    a, b, c, expected = [word.upper() for word in line.split()]
                except ValueError:
                    logger.info("skipping invalid line #%i in %s", line_no, questions)
                    continue

                results.append(expected)
        return results
def _load_word2vec_format(cls,
                          fname,
                          fvocab=None,
                          binary=False,
                          encoding='utf8',
                          unicode_errors='strict',
                          limit=None,
                          datatype=REAL):
    """Load the input-hidden weight matrix from the original C word2vec-tool format.

    Note that the information stored in the file is incomplete (the binary tree is missing),
    so while you can query for word similarity etc., you cannot continue training
    with a model loaded this way.

    Parameters
    ----------
    fname : str
        The file path to the saved word2vec-format file.
    fvocab : str, optional
        File path to the vocabulary. Word counts are read from `fvocab` filename, if set
        (this is the file generated by `-save-vocab` flag of the original C tool).
    binary : bool
        If True, the data is in the binary word2vec format.
    encoding : str
        If you trained the C model using non-utf8 encoding for words, specify that
        encoding in `encoding`.
    unicode_errors : str, optional
        Error handling for unicode decoding; default 'strict'. The value is passed as the
        `errors` argument to the unicode() (Python 2.x) or str() (Python 3.x) function. If your
        source file may include word tokens truncated in the middle of a multibyte unicode
        character (as is common from the original word2vec.c tool), 'ignore' or 'replace' may help.
    limit : int
        Sets a maximum number of word-vectors to read from the file. The default,
        None, means read all.
    datatype : :class:`numpy.float*`
        (Experimental) Can coerce dimensions to a non-default float type (such
        as np.float16) to save memory. (Such types may result in much slower bulk operations
        or incompatibility with optimized routines.)

    Returns
    -------
    :obj:`cls`
        Returns the loaded model as an instance of :class:`cls`.

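    Examples
    --------
    This helper is normally reached through a public wrapper. A minimal sketch, assuming the
    standard :class:`~gensim.models.keyedvectors.KeyedVectors` entry point and an illustrative
    local path:

    >>> from gensim.models import KeyedVectors  # doctest: +SKIP
    >>> kv = KeyedVectors.load_word2vec_format('vectors.bin', binary=True)  # doctest: +SKIP
    >>> kv.most_similar('king', topn=3)  # doctest: +SKIP
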
    """
    from gensim.models.keyedvectors import Vocab
    counts = None
    if fvocab is not None:
        logger.info("loading word counts from %s", fvocab)
        counts = {}
        with utils.smart_open(fvocab) as fin:
            for line in fin:
                word, count = utils.to_unicode(line).strip().split()
                counts[word] = int(count)

    logger.info("loading projection weights from %s", fname)
    with utils.smart_open(fname) as fin:
        header = utils.to_unicode(fin.readline(), encoding=encoding)
        vocab_size, vector_size = (int(x) for x in header.split())  # throws for invalid file format
        if limit:
            vocab_size = min(vocab_size, limit)
        result = cls(vector_size)
        result.vector_size = vector_size
        result.vectors = zeros((vocab_size, vector_size), dtype=datatype)

        def add_word(word, weights):
            word_id = len(result.vocab)
            if word in result.vocab:
                logger.warning(
                    "duplicate word '%s' in %s, ignoring all but first", word,
                    fname)
                return
            if counts is None:
                # most common scenario: no vocab file given. just make up some bogus counts, in descending order
                result.vocab[word] = Vocab(index=word_id,
                                           count=vocab_size - word_id)
            elif word in counts:
                # use count from the vocab file
                result.vocab[word] = Vocab(index=word_id, count=counts[word])
            else:
                # vocab file given, but word is missing -- set count to None (TODO: or raise?)
                logger.warning(
                    "vocabulary file is incomplete: '%s' is missing", word)
                result.vocab[word] = Vocab(index=word_id, count=None)
            result.vectors[word_id] = weights
            result.index2word.append(word)

        if binary:
            binary_len = dtype(REAL).itemsize * vector_size
            for _ in xrange(vocab_size):
                # mixed text and binary: read text first, then binary
                word = []
                while True:
                    ch = fin.read(1)
                    if ch == b' ':
                        break
                    if ch == b'':
                        raise EOFError(
                            "unexpected end of input; is count incorrect or file otherwise damaged?"
                        )
                    if ch != b'\n':  # ignore newlines in front of words (some binary files have them)
                        word.append(ch)
                word = utils.to_unicode(b''.join(word),
                                        encoding=encoding,
                                        errors=unicode_errors)
                weights = fromstring(fin.read(binary_len), dtype=REAL)
                add_word(word, weights)
        else:
            for line_no in xrange(vocab_size):
                line = fin.readline()
                if line == b'':
                    raise EOFError(
                        "unexpected end of input; is count incorrect or file otherwise damaged?"
                    )
                parts = utils.to_unicode(line.rstrip(),
                                         encoding=encoding,
                                         errors=unicode_errors).split(" ")
                if len(parts) != vector_size + 1:
                    raise ValueError(
                        "invalid vector on line %s (is this really the text format?)"
                        % line_no)
                word, weights = parts[0], [REAL(x) for x in parts[1:]]
                add_word(word, weights)
    if result.vectors.shape[0] != len(result.vocab):
        logger.info(
            "duplicate words detected, shrinking matrix size from %i to %i",
            result.vectors.shape[0], len(result.vocab))
        result.vectors = ascontiguousarray(result.vectors[:len(result.vocab)])
    assert (len(result.vocab), vector_size) == result.vectors.shape

    logger.info("loaded %s matrix from %s", result.vectors.shape, fname)
    return result
    def from_corpus(corpus, id2word=None):
        """Create :class:`~gensim.corpora.dictionary.Dictionary` from an existing corpus.

        Parameters
        ----------
        corpus : iterable of iterable of (int, number)
            Corpus in BoW format.
        id2word : dict of (int, object)
            Mapping id -> word. If None, the mapping `id2word[word_id] = str(word_id)` will be used.

        Notes
        -----
        This can be useful if you only have a term-document BOW matrix (represented by `corpus`), but not the original
        text corpus. This method will scan the term-document count matrix for all word ids that appear in it,
        then construct :class:`~gensim.corpora.dictionary.Dictionary` which maps each `word_id -> id2word[word_id]`.
        `id2word` is an optional dictionary that maps the `word_id` to a token.
        In case `id2word` isn't specified the mapping `id2word[word_id] = str(word_id)` will be used.

        Returns
        -------
        :class:`~gensim.corpora.dictionary.Dictionary`
            Inferred dictionary from corpus.

        Examples
        --------
        >>> from gensim.corpora import Dictionary
        >>>
        >>> corpus = [[(1, 1.0)], [], [(0, 5.0), (2, 1.0)], []]
        >>> dct = Dictionary.from_corpus(corpus)
        >>> len(dct)
        3

        """
        result = Dictionary()
        max_id = -1
        for docno, document in enumerate(corpus):
            if docno % 10000 == 0:
                logger.info("adding document #%i to %s", docno, result)
            result.num_docs += 1
            result.num_nnz += len(document)
            for wordid, word_freq in document:
                max_id = max(wordid, max_id)
                result.num_pos += word_freq
                result.dfs[wordid] = result.dfs.get(wordid, 0) + 1

        if id2word is None:
            # make sure length(result) == get_max_id(corpus) + 1
            result.token2id = {unicode(i): i for i in xrange(max_id + 1)}
        else:
            # id=>word mapping given: simply copy it
            result.token2id = {
                utils.to_unicode(token): idx
                for idx, token in iteritems(id2word)
            }
        for idx in itervalues(result.token2id):
            # make sure all token ids have a valid `dfs` entry
            result.dfs[idx] = result.dfs.get(idx, 0)

        logger.info("built %s from %i documents (total %i corpus positions)",
                    result, result.num_docs, result.num_pos)
        return result