def _save_word2vec_format(fname, vocab, vectors, fvocab=None, binary=False, total_vec=None):
    """Store the input-hidden weight matrix in the same format used by the original
    C word2vec-tool, for compatibility.

    Parameters
    ----------
    fname : str
        The file path used to save the vectors in.
    vocab : dict
        The vocabulary of words.
    vectors : numpy.array
        The vectors to be stored.
    fvocab : str, optional
        File path used to save the vocabulary.
    binary : bool, optional
        If True, the data will be saved in binary word2vec format,
        else it will be saved in plain text.
    total_vec : int, optional
        Explicitly specify the total number of vectors
        (in case word vectors are appended with document vectors afterwards).

    """
    if not (vocab or vectors):
        raise RuntimeError("no input")
    if total_vec is None:
        total_vec = len(vocab)
    vector_size = vectors.shape[1]
    if fvocab is not None:
        logger.info("storing vocabulary in %s", fvocab)
        with utils.smart_open(fvocab, 'wb') as vout:
            for word, vocab_ in sorted(iteritems(vocab), key=lambda item: -item[1].count):
                vout.write(utils.to_utf8("%s %s\n" % (word, vocab_.count)))
    logger.info("storing %sx%s projection weights into %s", total_vec, vector_size, fname)
    assert (len(vocab), vector_size) == vectors.shape
    with utils.smart_open(fname, 'wb') as fout:
        fout.write(utils.to_utf8("%s %s\n" % (total_vec, vector_size)))
        # store in sorted order: most frequent words at the top
        for word, vocab_ in sorted(iteritems(vocab), key=lambda item: -item[1].count):
            row = vectors[vocab_.index]
            if binary:
                fout.write(utils.to_utf8(word) + b" " + row.tostring())
            else:
                fout.write(utils.to_utf8("%s %s\n" % (word, ' '.join("%f" % val for val in row))))
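# Minimal usage sketch for _save_word2vec_format (illustrative only; uses gensim's Vocab
# class, though any object exposing `count` and `index` attributes would work here).
import numpy as np
from gensim.models.keyedvectors import Vocab

vectors = np.array([[0.1, 0.2], [0.3, 0.4]], dtype=np.float32)
vocab = {'night': Vocab(index=0, count=10), 'day': Vocab(index=1, count=7)}
_save_word2vec_format('/tmp/vectors.txt', vocab, vectors, binary=False)
# '/tmp/vectors.txt' now starts with the header "2 2", followed by one
# "word v1 v2" line per word, most frequent word first.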
def save_as_text(self, fname, sort_by_word=True):
    """Save :class:`~gensim.corpora.dictionary.Dictionary` to a text file.

    Parameters
    ----------
    fname : str
        Path to output file.
    sort_by_word : bool, optional
        If True, sort by word in lexicographical order.

    Notes
    -----
    Format::

        num_docs
        id_1[TAB]word_1[TAB]document_frequency_1[NEWLINE]
        id_2[TAB]word_2[TAB]document_frequency_2[NEWLINE]
        ....
        id_k[TAB]word_k[TAB]document_frequency_k[NEWLINE]

    Warnings
    --------
    Text format should be used for corpus inspection. Use
    :meth:`~gensim.corpora.dictionary.Dictionary.save` and
    :meth:`~gensim.corpora.dictionary.Dictionary.load`
    to store in binary format (pickle) for better performance.

    See Also
    --------
    :meth:`~gensim.corpora.dictionary.Dictionary.load_from_text`

    Examples
    --------
    >>> from gensim.corpora import Dictionary
    >>> from gensim.test.utils import get_tmpfile
    >>>
    >>> tmp_fname = get_tmpfile("dictionary")
    >>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
    >>>
    >>> dct = Dictionary(corpus)
    >>> dct.save_as_text(tmp_fname)
    >>>
    >>> loaded_dct = Dictionary.load_from_text(tmp_fname)
    >>> assert dct.token2id == loaded_dct.token2id

    """
    logger.info("saving dictionary mapping to %s", fname)
    with utils.smart_open(fname, 'wb') as fout:
        numdocs_line = "%d\n" % self.num_docs
        fout.write(utils.to_utf8(numdocs_line))
        if sort_by_word:
            for token, tokenid in sorted(iteritems(self.token2id)):
                line = "%i\t%s\t%i\n" % (tokenid, token, self.dfs.get(tokenid, 0))
                fout.write(utils.to_utf8(line))
        else:
            for tokenid, freq in sorted(iteritems(self.dfs), key=lambda item: -item[1]):
                line = "%i\t%s\t%i\n" % (tokenid, self[tokenid], freq)
                fout.write(utils.to_utf8(line))
def accuracy(self, questions, case_insensitive=False, predictor_method=0):
    """Evaluate the ensemble on an analogy question file; return per-section results."""
    correct = 0
    incorrect = 0
    sections, section = [], None
    for line_no, line in enumerate(utils.smart_open(questions)):
        # TODO: use level3 BLAS (=evaluate multiple questions at once), for speed
        line = utils.to_unicode(line)
        if line.startswith(': '):
            # a new section starts => store the old section
            if section:
                sections.append(section)
            section = {'section': line.lstrip(': ').strip(), 'correct': [], 'incorrect': []}
        else:
            if not section:
                raise ValueError("missing section header before line #%i in %s" % (line_no, questions))
            try:
                if case_insensitive:
                    a, b, c, expected = [word.upper() for word in line.split()]
                else:
                    a, b, c, expected = [word for word in line.split()]
            except ValueError:
                # skip invalid lines (wrong number of fields)
                continue
            predicted = [None, None]
            if predictor_method == 0:
                # evaluation method: majority vote
                predicted = simple_ensamble.predict_majority_vote(
                    self, positive_word_list=[b, c], negative_word_list=[a], top_n_words=1)
            elif predictor_method == 1:
                # evaluation method: summed most probable
                predicted = simple_ensamble.predict_sum_proberbility(
                    self, positive_word_list=[b, c], negative_word_list=[a], top_n_words=1)
            elif predictor_method == 2:
                # evaluation method: weighted sum of probabilities
                predicted = simple_ensamble.predict_weighted_sum_proberbility(
                    self, positive_word_list=[b, c], negative_word_list=[a], top_n_words=1)
            else:
                raise ValueError("incorrect argument type for predictor_method")
            if predicted[0] == expected:
                correct += 1
                section['correct'].append((a, b, c, expected))
            else:
                incorrect += 1
                section['incorrect'].append((a, b, c, expected))
    if section:
        # store the last section, too
        sections.append(section)
    print(correct)
    print(incorrect)
    return sections
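# Hypothetical call site for the ensemble accuracy evaluation above (`ensemble` is an
# illustrative instance name). The questions file is assumed to follow the word2vec
# questions-words.txt format: ": section" headers followed by "a b c expected" lines.
sections = ensemble.accuracy('questions-words.txt', case_insensitive=True, predictor_method=0)
for s in sections:
    print(s['section'], len(s['correct']), len(s['incorrect']))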
def load_from_text(fname):
    """Load a previously stored :class:`~gensim.corpora.dictionary.Dictionary` from a text file.

    Mirror function to :meth:`~gensim.corpora.dictionary.Dictionary.save_as_text`.

    Parameters
    ----------
    fname : str
        Path to a file produced by :meth:`~gensim.corpora.dictionary.Dictionary.save_as_text`.

    See Also
    --------
    :meth:`~gensim.corpora.dictionary.Dictionary.save_as_text`

    Examples
    --------
    >>> from gensim.corpora import Dictionary
    >>> from gensim.test.utils import get_tmpfile
    >>>
    >>> tmp_fname = get_tmpfile("dictionary")
    >>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
    >>>
    >>> dct = Dictionary(corpus)
    >>> dct.save_as_text(tmp_fname)
    >>>
    >>> loaded_dct = Dictionary.load_from_text(tmp_fname)
    >>> assert dct.token2id == loaded_dct.token2id

    """
    result = Dictionary()
    with utils.smart_open(fname) as f:
        for lineno, line in enumerate(f):
            line = utils.to_unicode(line)
            if lineno == 0:
                if line.strip().isdigit():
                    # Older versions of save_as_text may not write num_docs on the first line.
                    result.num_docs = int(line.strip())
                    continue
                else:
                    logging.warning("Text does not contain num_docs on the first line.")
            try:
                wordid, word, docfreq = line[:-1].split('\t')
            except Exception:
                raise ValueError("invalid line in dictionary file %s: %s" % (fname, line.strip()))
            wordid = int(wordid)
            if word in result.token2id:
                raise KeyError('token %s is defined as ID %d and as ID %d' % (word, wordid, result.token2id[word]))
            result.token2id[word] = wordid
            result.dfs[wordid] = int(docfreq)
    return result
def __init__(self, fname):
    """

    Parameters
    ----------
    fname : str
        Path to output file.

    """
    self.fname = fname
    if fname.endswith(".gz") or fname.endswith('.bz2'):
        raise NotImplementedError("compressed output not supported with MmWriter")
    self.fout = utils.smart_open(self.fname, 'wb+')  # open for both reading and writing
    self.headers_written = False
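# Construction sketch: MmWriter keeps the handle open and later seeks back to patch the
# headers, which is why compressed (non-seekable) outputs are rejected up front.
writer = MmWriter('/tmp/corpus.mm')       # fine: plain file, seekable
# MmWriter('/tmp/corpus.mm.gz')           # would raise NotImplementedError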
def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000,
                        case_insensitive=False, dummy4unknown=False, similarity_model_type=0):
    """Evaluate the ensemble on word pairs; return Pearson and Spearman correlations
    between model similarities and the gold-standard similarities."""
    similarity_gold = []
    similarity_model = []
    for line_no, line in enumerate(utils.smart_open(pairs)):
        line = utils.to_unicode(line)
        if line.startswith('#'):
            # may be a comment
            continue
        try:
            if case_insensitive:
                a, b, sim = [word.lower() for word in line.split(delimiter)]
            else:
                a, b, sim = [word for word in line.split(delimiter)]
            sim = float(sim)
        except (ValueError, TypeError):
            logger.info('skipping invalid line #%d in %s', line_no, pairs)
            continue
        similarity_gold.append(sim)  # similarity from the dataset
        if similarity_model_type == 0:
            # similarity from the model: plain average over ensemble members
            similarity_model.append(simple_ensamble.similarity_avg_proberbility(self, a, b))
        elif similarity_model_type == 1:
            if not self.weight_list:
                raise ValueError("No weights specified for ensamble model")
            similarity_model.append(simple_ensamble.similarity_weighted_avg_proberbility(self, a, b))
        else:
            raise ValueError("incorrect argument type for similarity_model_type")
    spearman = stats.spearmanr(similarity_gold, similarity_model)
    pearson = stats.pearsonr(similarity_gold, similarity_model)
    print(pearson)
    print(spearman)
    return pearson, spearman
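# Hypothetical call site for the word-pairs evaluation above (`ensemble` and the file
# name are illustrative). The pairs file is assumed to contain tab-separated
# "word1<TAB>word2<TAB>similarity" lines, as in WordSim-353.
pearson, spearman = ensemble.evaluate_word_pairs('wordsim353.tsv', similarity_model_type=0)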
def docbyoffset(self, offset):
    """Get the document at file offset `offset` (in bytes).

    Parameters
    ----------
    offset : int
        File offset, in bytes.

    Returns
    -------
    list of (int, number)
        Document in BoW format, reached by `offset`.

    """
    # empty documents are not stored explicitly in MM format, so the index marks
    # them with a special offset, -1.
    if offset == -1:
        return []
    if isinstance(self.input, string_types):
        fin, close_fin = utils.smart_open(self.input), True
    else:
        fin, close_fin = self.input, False

    fin.seek(offset)  # works for gzip/bz2 input, too
    previd, document = -1, []
    for line in fin:
        docid, termid, val = line.split()
        if not self.transposed:
            termid, docid = docid, termid
        # -1 because matrix market indexes are 1-based => convert to 0-based
        docid, termid, val = int(docid) - 1, int(termid) - 1, float(val)
        assert previd <= docid, "matrix columns must come in ascending order"
        if docid != previd:
            if previd >= 0:
                break
            previd = docid
        document.append((termid, val))  # add another field to the current document

    if close_fin:
        fin.close()
    return document
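# Random-access sketch (assumes the corpus was serialized with an index of byte offsets,
# e.g. via MmCorpus.serialize, so that `corpus.index` is available after loading).
from gensim.corpora import MmCorpus
corpus = MmCorpus('/tmp/corpus.mm')
doc = corpus.docbyoffset(corpus.index[2])  # jump straight to the third document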
def get_human_similarities_results(self, test_set):
    """Return the list of human similarity judgements from a word-pairs test set."""
    dir_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
    test_set = dir_path + '/Code/TestingSet/' + test_set
    results = []
    for line_no, line in enumerate(utils.smart_open(test_set)):
        line = utils.to_unicode(line)
        if line.startswith('#'):
            # may be a comment
            continue
        try:
            a, b, sim = [word.upper() for word in line.split('\t')]
            sim = float(sim)
        except (ValueError, TypeError):
            logger.info('skipping invalid line #%d in %s', line_no, test_set)
            continue
        results.append(sim)  # similarity from the dataset
    return results
def get_expected_acc_results(self, questions):
    """Return a list of the expected results from an accuracy test."""
    dir_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
    questions = dir_path + '/Code/TestingSet/' + questions
    results = []
    for line_no, line in enumerate(utils.smart_open(questions)):
        line = utils.to_unicode(line)
        if line.startswith(': '):
            # section header, no expected answer on this line
            continue
        try:
            a, b, c, expected = [word.upper() for word in line.split()]
        except ValueError:
            logger.info("skipping invalid line #%i in %s", line_no, questions)
            continue
        results.append(expected)
    return results
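# Hypothetical call sites for the two test-set readers above (`evaluator` and the file
# names are illustrative); both helpers resolve paths relative to
# <project>/Code/TestingSet/, so only base names are passed in.
human_sims = evaluator.get_human_similarities_results('wordsim353.tsv')
expected_answers = evaluator.get_expected_acc_results('questions-words.txt')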
def _load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8', unicode_errors='strict',
                          limit=None, datatype=REAL):
    """Load the input-hidden weight matrix from the original C word2vec-tool format.

    Note that the information stored in the file is incomplete (the binary tree is missing),
    so while you can query for word similarity etc., you cannot continue training
    with a model loaded this way.

    Parameters
    ----------
    fname : str
        The file path to the saved word2vec-format file.
    fvocab : str, optional
        File path to the vocabulary. Word counts are read from `fvocab` filename, if set
        (this is the file generated by the `-save-vocab` flag of the original C tool).
    binary : bool, optional
        If True, indicates whether the data is in binary word2vec format.
    encoding : str, optional
        If you trained the C model using non-utf8 encoding for words, specify that encoding in `encoding`.
    unicode_errors : str, optional
        Default 'strict'; a string suitable to be passed as the `errors` argument to the
        unicode() (Python 2.x) or str() (Python 3.x) function. If your source file may include
        word tokens truncated in the middle of a multibyte unicode character (as is common from
        the original word2vec.c tool), 'ignore' or 'replace' may help.
    limit : int, optional
        Maximum number of word-vectors to read from the file. The default, None, means read all.
    datatype : :class:`numpy.float*`, optional
        (Experimental) Can coerce dimensions to a non-default float type (such as np.float16)
        to save memory. (Such types may result in much slower bulk operations or incompatibility
        with optimized routines.)

    Returns
    -------
    :obj:`cls`
        The loaded model as an instance of :class:`cls`.

    """
    from gensim.models.keyedvectors import Vocab
    counts = None
    if fvocab is not None:
        logger.info("loading word counts from %s", fvocab)
        counts = {}
        with utils.smart_open(fvocab) as fin:
            for line in fin:
                word, count = utils.to_unicode(line).strip().split()
                counts[word] = int(count)

    logger.info("loading projection weights from %s", fname)
    with utils.smart_open(fname) as fin:
        header = utils.to_unicode(fin.readline(), encoding=encoding)
        vocab_size, vector_size = (int(x) for x in header.split())  # throws for invalid file format
        if limit:
            vocab_size = min(vocab_size, limit)
        result = cls(vector_size)
        result.vector_size = vector_size
        result.vectors = zeros((vocab_size, vector_size), dtype=datatype)

        def add_word(word, weights):
            word_id = len(result.vocab)
            if word in result.vocab:
                logger.warning("duplicate word '%s' in %s, ignoring all but first", word, fname)
                return
            if counts is None:
                # most common scenario: no vocab file given. just make up some bogus counts, in descending order
                result.vocab[word] = Vocab(index=word_id, count=vocab_size - word_id)
            elif word in counts:
                # use count from the vocab file
                result.vocab[word] = Vocab(index=word_id, count=counts[word])
            else:
                # vocab file given, but word is missing -- set count to None (TODO: or raise?)
                logger.warning("vocabulary file is incomplete: '%s' is missing", word)
                result.vocab[word] = Vocab(index=word_id, count=None)
            result.vectors[word_id] = weights
            result.index2word.append(word)

        if binary:
            binary_len = dtype(REAL).itemsize * vector_size
            for _ in xrange(vocab_size):
                # mixed text and binary: read text first, then binary
                word = []
                while True:
                    ch = fin.read(1)
                    if ch == b' ':
                        break
                    if ch == b'':
                        raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?")
                    if ch != b'\n':  # ignore newlines in front of words (some binary files have them)
                        word.append(ch)
                word = utils.to_unicode(b''.join(word), encoding=encoding, errors=unicode_errors)
                weights = fromstring(fin.read(binary_len), dtype=REAL)
                add_word(word, weights)
        else:
            for line_no in xrange(vocab_size):
                line = fin.readline()
                if line == b'':
                    raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?")
                parts = utils.to_unicode(line.rstrip(), encoding=encoding, errors=unicode_errors).split(" ")
                if len(parts) != vector_size + 1:
                    raise ValueError("invalid vector on line %s (is this really the text format?)" % line_no)
                word, weights = parts[0], [REAL(x) for x in parts[1:]]
                add_word(word, weights)

    if result.vectors.shape[0] != len(result.vocab):
        logger.info("duplicate words detected, shrinking matrix size from %i to %i",
                    result.vectors.shape[0], len(result.vocab))
        result.vectors = ascontiguousarray(result.vectors[:len(result.vocab)])
    assert (len(result.vocab), vector_size) == result.vectors.shape

    logger.info("loaded %s matrix from %s", result.vectors.shape, fname)
    return result
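# Round-trip sketch, assuming `cls` is gensim's KeyedVectors: vectors saved by the C tool
# (or by _save_word2vec_format above) can be loaded and queried, but not trained further.
from gensim.models import KeyedVectors
kv = KeyedVectors.load_word2vec_format('/tmp/vectors.txt', binary=False)
print(kv.most_similar(positive=['night'], topn=3))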
def load_binary_data(self, encoding='utf8'):
    """Load data from the output binary file created by FastText training."""
    with utils.smart_open(self.file_name, 'rb') as f:
        self._load_model_params(f)
        self._load_dict(f, encoding=encoding)
        self._load_vectors(f)
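# Illustrative call, assuming `model` is a FastText wrapper instance whose `file_name`
# attribute points at the .bin file produced by the fasttext command-line tool.
model.file_name = 'model.bin'
model.load_binary_data(encoding='utf8')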