def __iter__(self):
    """Iterate over the whole corpus, one document at a time.

    Yields
    ------
    (int, list of (int, float))
        0-based document id and the document in BoW format, i.e. a list of
        `(term_id, value)` pairs.

    Raises
    ------
    AssertionError
        If the entries in the input are not sorted by document id.

    Notes
    -----
    Document ids are yielded contiguously: ids that have no entry in the input are
    yielded as empty documents, both between explicitly stored documents and after
    the last one, up to `self.num_docs`.

    """
    with utils.file_or_filename(self.input) as lines:
        self.skip_headers(lines)

        previd = -1
        document = []
        for line in lines:
            docid, termid, val = utils.to_unicode(line).split()  # needed for python3
            if not self.transposed:
                termid, docid = docid, termid
            # -1 because matrix market indexes are 1-based => convert to 0-based
            docid, termid, val = int(docid) - 1, int(termid) - 1, float(val)
            # Explicit raise instead of `assert`: asserts are stripped under `python -O`,
            # which would let unsorted input silently produce wrong documents.
            # The exception type is kept so existing callers see no difference.
            if previd > docid:
                raise AssertionError("matrix columns must come in ascending order")
            if docid != previd:
                # change of document: return the document read so far (its id is previd)
                if previd >= 0:
                    yield previd, document

                # return implicit (empty) documents between previous id and new id
                # too, to keep consistent document numbering and corpus length
                for missing_id in xrange(previd + 1, docid):
                    yield missing_id, []

                # from now on start adding fields to a new document, with a new id
                previd = docid
                document = []

            document.append((termid, val))  # add another field to the current document

    # handle the last document, as a special case
    if previd >= 0:
        yield previd, document

    # return empty documents between the last explicit document and the number
    # of documents as specified in the header
    for missing_id in xrange(previd + 1, self.num_docs):
        yield missing_id, []
def accuracy(self, questions, case_insensitive=False, predictor_method=0):
    """Evaluate the ensemble on a file of analogy questions, grouped by section.

    Lines starting with ``': '`` open a new section. Every other line is expected to
    hold four whitespace-separated words ``a b c expected``; lines with a different
    number of words are skipped silently. The prediction is requested from
    `simple_ensamble` with ``positive_word_list=[b, c]``, ``negative_word_list=[a]``
    and ``top_n_words=1``, and its first element is compared with `expected`.

    The total numbers of correct and incorrect answers are printed to stdout.

    Parameters
    ----------
    questions : str
        Path of the questions file, opened with `utils.smart_open`.
    case_insensitive : bool, optional
        If True, all four words of a question are upper-cased before evaluation.
    predictor_method : int, optional
        Which `simple_ensamble` predictor to use: 0 for `predict_majority_vote`,
        1 for `predict_sum_proberbility`, 2 for `predict_weighted_sum_proberbility`.

    Returns
    -------
    list of dict
        One dict per section with keys ``'section'`` (section name), ``'correct'`` and
        ``'incorrect'`` (lists of ``(a, b, c, expected)`` tuples).

    Raises
    ------
    ValueError
        If a question line appears before any section header, or if `predictor_method`
        is not 0, 1 or 2 (raised when the first well-formed question is evaluated).

    """
    correct = 0
    incorrect = 0
    sections, section = [], None
    # `with` guarantees the file is closed, also when an exception is raised below
    with utils.smart_open(questions) as fin:
        for line_no, line in enumerate(fin):
            # TODO: use level3 BLAS (=evaluate multiple questions at once), for speed
            line = utils.to_unicode(line)
            if line.startswith(': '):
                # a new section starts => store the old section
                if section is not None:
                    sections.append(section)
                section = {'section': line.lstrip(': ').strip(), 'correct': [], 'incorrect': []}
                continue

            if section is None:
                raise ValueError("missing section header before line #%i in %s" % (line_no, questions))

            words = line.split()
            if len(words) != 4:
                # not a well-formed question: skip it
                continue
            if case_insensitive:
                words = [word.upper() for word in words]
            a, b, c, expected = words

            if predictor_method == 0:
                # majority vote
                predicted = simple_ensamble.predict_majority_vote(
                    self, positive_word_list=[b, c], negative_word_list=[a], top_n_words=1)
            elif predictor_method == 1:
                # summed probabilities
                predicted = simple_ensamble.predict_sum_proberbility(
                    self, positive_word_list=[b, c], negative_word_list=[a], top_n_words=1)
            elif predictor_method == 2:
                # weighted sum of probabilities
                predicted = simple_ensamble.predict_weighted_sum_proberbility(
                    self, positive_word_list=[b, c], negative_word_list=[a], top_n_words=1)
            else:
                raise ValueError("incorrect argument type for predictor_method")

            if predicted[0] == expected:
                correct += 1
                section['correct'].append((a, b, c, expected))
            else:
                incorrect += 1
                section['incorrect'].append((a, b, c, expected))

    if section is not None:
        # store the last section, too
        sections.append(section)

    print(correct)
    print(incorrect)
    return sections
def load_from_text(fname):
    """Load a previously stored :class:`~gensim.corpora.dictionary.Dictionary` from a text file.

    Mirror function to :meth:`~gensim.corpora.dictionary.Dictionary.save_as_text`.

    Parameters
    ----------
    fname: str
        Path to file produced by :meth:`~gensim.corpora.dictionary.Dictionary.save_as_text`.

    Returns
    -------
    :class:`~gensim.corpora.dictionary.Dictionary`
        Dictionary with `token2id`, `dfs` and (if present in the file) `num_docs` filled in.

    Raises
    ------
    ValueError
        If a line does not consist of exactly three tab-separated fields.
    KeyError
        If the same token appears on more than one line.

    See Also
    --------
    :meth:`~gensim.corpora.dictionary.Dictionary.save_as_text`

    Examples
    --------
    >>> from gensim.corpora import Dictionary
    >>> from gensim.test.utils import get_tmpfile
    >>>
    >>> tmp_fname = get_tmpfile("dictionary")
    >>> corpus = [["máma", "mele", "maso"], ["ema", "má", "máma"]]
    >>>
    >>> dct = Dictionary(corpus)
    >>> dct.save_as_text(tmp_fname)
    >>>
    >>> loaded_dct = Dictionary.load_from_text(tmp_fname)
    >>> assert dct.token2id == loaded_dct.token2id

    """
    result = Dictionary()
    with utils.smart_open(fname) as f:
        for lineno, line in enumerate(f):
            line = utils.to_unicode(line)
            if lineno == 0:
                if line.strip().isdigit():
                    # Older versions of save_as_text may not write num_docs on first line.
                    result.num_docs = int(line.strip())
                    continue
                else:
                    # no num_docs header: treat the first line as a regular entry
                    logging.warning("Text does not contain num_docs on the first line.")
            try:
                # Strip only the line terminator. Blindly dropping the last character
                # would truncate `docfreq` when the final line has no trailing newline.
                wordid, word, docfreq = line.rstrip('\r\n').split('\t')
            except ValueError:
                raise ValueError("invalid line in dictionary file %s: %s" % (fname, line.strip()))
            wordid = int(wordid)
            if word in result.token2id:
                raise KeyError(
                    'token %s is defined as ID %d and as ID %d' % (word, wordid, result.token2id[word]))
            result.token2id[word] = wordid
            result.dfs[wordid] = int(docfreq)
    return result
def __init__(self, input, transposed=True):
    """Validate the Matrix Market banner of `input` and read the matrix dimensions.

    Sets `self.num_docs`, `self.num_terms` and `self.num_nnz` from the first line
    that does not start with ``%``; all three stay at zero if no such line exists.

    Parameters
    ----------
    input : {str, file-like object}
        Path to input file or file-like object (in Matrix Market format).
    transposed : bool, optional
        "Orientation" of the documents. If True, the first number of the size line is
        taken as the number of documents and the second as the number of terms;
        if False, the two are swapped.

    Raises
    ------
    ValueError
        If the first line is not a ``%%matrixmarket matrix coordinate real general``
        banner (compared case-insensitively).

    """
    logger.info("initializing corpus reader from %s", input)
    self.input = input
    self.transposed = transposed

    with utils.file_or_filename(self.input) as lines:
        try:
            first_line = next(lines)
        except StopIteration:
            # empty input: there is no banner to check
            pass
        else:
            header = utils.to_unicode(first_line).strip()
            banner_ok = header.lower().startswith('%%matrixmarket matrix coordinate real general')
            if not banner_ok:
                raise ValueError(
                    "File %s not in Matrix Market format with coordinate real general; instead found: \n%s" %
                    (self.input, header)
                )

        self.num_nnz = 0
        self.num_terms = 0
        self.num_docs = 0
        for raw_line in lines:
            text = utils.to_unicode(raw_line)
            if text.startswith('%'):
                # comment line, keep looking for the size line
                continue
            first_dim, second_dim, nnz = [int(field) for field in text.split()]
            if self.transposed:
                self.num_docs, self.num_terms = first_dim, second_dim
            else:
                self.num_docs, self.num_terms = second_dim, first_dim
            self.num_nnz = nnz
            break

    logger.info(
        "accepted corpus with %i documents, %i features, %i non-zero entries",
        self.num_docs, self.num_terms, self.num_nnz
    )
def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case_insensitive=False,
                        dummy4unknown=False, similarity_model_type="0"):
    """Correlate the ensemble's word-pair similarities with gold similarity scores.

    Each line of `pairs` that does not start with ``#`` is expected to hold three
    `delimiter`-separated fields: two words and a numeric similarity. Lines that do not
    match this shape are skipped silently. Both correlation results are printed to stdout.

    Parameters
    ----------
    pairs : str
        Path of the word-pairs file, opened with `utils.smart_open`.
    delimiter : str, optional
        Field separator used in `pairs`.
    restrict_vocab : int, optional
        Accepted for interface compatibility; not used by this implementation.
    case_insensitive : bool, optional
        If True, all fields are lower-cased before evaluation.
    dummy4unknown : bool, optional
        Accepted for interface compatibility; not used by this implementation.
    similarity_model_type : {0, 1, "0", "1"}, optional
        0 uses `simple_ensamble.similarity_avg_proberbility`, 1 uses
        `simple_ensamble.similarity_weighted_avg_proberbility` (requires a non-empty
        `self.weight_list`). The string forms are accepted as well, so that the
        default value ``"0"`` selects the first method.

    Returns
    -------
    (object, object)
        Results of `stats.pearsonr` and `stats.spearmanr` between gold and model
        similarities, in this order.

    Raises
    ------
    ValueError
        If `similarity_model_type` is not supported, or if it selects the weighted
        method while `self.weight_list` is empty. Both are raised when the first
        well-formed pair is evaluated.

    """
    similarity_gold = []
    similarity_model = []
    # `with` guarantees the file is closed, also when an exception is raised below
    with utils.smart_open(pairs) as fin:
        for line in fin:
            line = utils.to_unicode(line)
            if line.startswith('#'):
                # May be a comment
                continue
            try:
                if case_insensitive:
                    a, b, sim = [word.lower() for word in line.split(delimiter)]
                else:
                    a, b, sim = line.split(delimiter)
                sim = float(sim)
            except (ValueError, TypeError):
                # malformed line: skip it
                continue

            similarity_gold.append(sim)  # Similarity from the dataset
            # The default is the string "0" while the checks used to compare against
            # ints only, so a default call always ended in the error branch.
            if similarity_model_type in (0, "0"):
                similarity_model.append(
                    simple_ensamble.similarity_avg_proberbility(self, a, b))  # Similarity from the model
            elif similarity_model_type in (1, "1"):
                if self.weight_list == []:
                    raise ValueError("No weights specified for ensamble model")
                similarity_model.append(
                    simple_ensamble.similarity_weighted_avg_proberbility(self, a, b))
            else:
                raise ValueError("incorrect argument type for similarity_model_type")

    spearman = stats.spearmanr(similarity_gold, similarity_model)
    pearson = stats.pearsonr(similarity_gold, similarity_model)
    print(pearson)
    print(spearman)
    return pearson, spearman
def get_human_similarities_results(self, test_set):
    """Read the gold similarity scores of a word-pairs test set.

    The file is looked up as ``<base>/Code/TestingSet/<test_set>``, where ``<base>`` is
    the parent of the directory containing this source file. Lines starting with ``#``
    are ignored. Every other line must hold three tab-separated fields, the last of
    which is parsed as a float; lines that do not match are logged and skipped.

    Parameters
    ----------
    test_set : str
        File name of the test set, relative to the ``Code/TestingSet`` directory.

    Returns
    -------
    list of float
        Similarity scores in file order.

    """
    dir_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
    test_set = dir_path + '/Code/TestingSet/' + test_set
    results = []
    # `with` guarantees the file is closed once all lines are read
    with utils.smart_open(test_set) as fin:
        for line_no, line in enumerate(fin):
            line = utils.to_unicode(line)
            if line.startswith('#'):
                # May be a comment
                continue
            try:
                # only the score is needed; the two words are checked for presence only
                _, _, sim = line.split('\t')
                sim = float(sim)
            except (ValueError, TypeError):
                logger.info('skipping invalid line #%d in %s', line_no, test_set)
                continue
            results.append(sim)  # Similarity from the dataset
    return results
def get_expected_acc_results(self, questions):
    """Return the list of expected answers of an accuracy (analogy) test set.

    The file is looked up as ``<base>/Code/TestingSet/<questions>``, where ``<base>`` is
    the parent of the directory containing this source file. Lines starting with
    ``': '`` (section headers) are ignored. Every other line must hold four
    whitespace-separated words, the last of which is the expected answer; lines that
    do not match are logged and skipped.

    Parameters
    ----------
    questions : str
        File name of the questions file, relative to the ``Code/TestingSet`` directory.

    Returns
    -------
    list of str
        Upper-cased expected answers in file order.

    """
    dir_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
    questions = dir_path + '/Code/TestingSet/' + questions
    results = []
    # `with` guarantees the file is closed once all lines are read
    with utils.smart_open(questions) as fin:
        for line_no, line in enumerate(fin):
            line = utils.to_unicode(line)
            if line.startswith(': '):
                # section header, carries no expected answer
                continue
            words = line.split()
            if len(words) != 4:
                logger.info("skipping invalid line #%i in %s", line_no, questions)
                continue
            results.append(words[3].upper())
    return results
def _load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8', unicode_errors='strict',
                          limit=None, datatype=REAL):
    """Load the input-hidden weight matrix from the original C word2vec-tool format.

    Note that the information stored in the file is incomplete (the binary tree is missing),
    so while you can query for word similarity etc., you cannot continue training
    with a model loaded this way.

    Parameters
    ----------
    fname : str
        The file path to the saved word2vec-format file.
    fvocab : str, optional
        File path to the vocabulary. Word counts are read from `fvocab` filename, if set
        (this is the file generated by `-save-vocab` flag of the original C tool).
    binary : bool, optional
        If True, the data is read as binary word2vec format, otherwise as text format.
    encoding : str, optional
        If you trained the C model using non-utf8 encoding for words, specify that
        encoding in `encoding`.
    unicode_errors : str, optional
        A string suitable to be passed as the `errors` argument to the unicode() (Python 2.x)
        or str() (Python 3.x) function. If your source file may include word tokens truncated
        in the middle of a multibyte unicode character (as is common from the original
        word2vec.c tool), 'ignore' or 'replace' may help.
    limit : int, optional
        Sets a maximum number of word-vectors to read from the file. The default,
        None, means read all.
    datatype : numpy float type, optional
        (Experimental) Can coerce dimensions to a non-default float type (such as np.float16)
        to save memory. (Such types may result in much slower bulk operations or
        incompatibility with optimized routines.)

    Returns
    -------
    object
        The loaded model, an instance of `cls`.

    Raises
    ------
    EOFError
        If the file ends before the announced number of vectors was read.
    ValueError
        If a line of a text-format file does not hold `vector_size + 1` fields.

    """
    # function-scope imports, following the existing style of this function
    from numpy import frombuffer
    from gensim.models.keyedvectors import Vocab

    counts = None
    if fvocab is not None:
        logger.info("loading word counts from %s", fvocab)
        counts = {}
        with utils.smart_open(fvocab) as fin:
            for line in fin:
                word, count = utils.to_unicode(line).strip().split()
                counts[word] = int(count)

    logger.info("loading projection weights from %s", fname)
    with utils.smart_open(fname) as fin:
        header = utils.to_unicode(fin.readline(), encoding=encoding)
        vocab_size, vector_size = (int(x) for x in header.split())  # throws for invalid file format
        if limit:
            vocab_size = min(vocab_size, limit)
        result = cls(vector_size)
        result.vector_size = vector_size
        result.vectors = zeros((vocab_size, vector_size), dtype=datatype)

        def add_word(word, weights):
            """Register `word` with the next free index and store its vector; duplicates are ignored."""
            word_id = len(result.vocab)
            if word in result.vocab:
                logger.warning("duplicate word '%s' in %s, ignoring all but first", word, fname)
                return
            if counts is None:
                # most common scenario: no vocab file given. just make up some bogus counts, in descending order
                result.vocab[word] = Vocab(index=word_id, count=vocab_size - word_id)
            elif word in counts:
                # use count from the vocab file
                result.vocab[word] = Vocab(index=word_id, count=counts[word])
            else:
                # vocab file given, but word is missing -- set count to None (TODO: or raise?)
                logger.warning("vocabulary file is incomplete: '%s' is missing", word)
                result.vocab[word] = Vocab(index=word_id, count=None)
            result.vectors[word_id] = weights
            result.index2word.append(word)

        if binary:
            binary_len = dtype(REAL).itemsize * vector_size
            for _ in xrange(vocab_size):
                # mixed text and binary: read text first, then binary
                word = []
                while True:
                    ch = fin.read(1)
                    if ch == b' ':
                        break
                    if ch == b'':
                        raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?")
                    if ch != b'\n':  # ignore newlines in front of words (some binary files have)
                        word.append(ch)
                word = utils.to_unicode(b''.join(word), encoding=encoding, errors=unicode_errors)
                # `frombuffer` replaces the deprecated binary mode of `fromstring`; the
                # resulting read-only view is copied into `result.vectors` by `add_word`.
                weights = frombuffer(fin.read(binary_len), dtype=REAL)
                add_word(word, weights)
        else:
            for line_no in xrange(vocab_size):
                line = fin.readline()
                if line == b'':
                    raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?")
                parts = utils.to_unicode(line.rstrip(), encoding=encoding, errors=unicode_errors).split(" ")
                if len(parts) != vector_size + 1:
                    raise ValueError("invalid vector on line %s (is this really the text format?)" % line_no)
                word, weights = parts[0], [REAL(x) for x in parts[1:]]
                add_word(word, weights)

    if result.vectors.shape[0] != len(result.vocab):
        # duplicates were skipped by add_word, so trailing rows were never filled
        logger.info(
            "duplicate words detected, shrinking matrix size from %i to %i",
            result.vectors.shape[0], len(result.vocab)
        )
        result.vectors = ascontiguousarray(result.vectors[:len(result.vocab)])
    assert (len(result.vocab), vector_size) == result.vectors.shape

    logger.info("loaded %s matrix from %s", result.vectors.shape, fname)
    return result
def from_corpus(corpus, id2word=None):
    """Build a :class:`~gensim.corpora.dictionary.Dictionary` from a corpus that is already in BoW format.

    Parameters
    ----------
    corpus : iterable of iterable of (int, number)
        Corpus in BoW format.
    id2word : dict of (int, object), optional
        Mapping id -> word. If None, every id from 0 up to the largest word id seen in
        `corpus` is mapped to its own string representation.

    Returns
    -------
    :class:`~gensim.corpora.dictionary.Dictionary`
        Inferred dictionary from corpus.

    Notes
    -----
    Useful when only a term-document BoW matrix (`corpus`) is available and the original
    text is not. The corpus is scanned once to collect `num_docs`, `num_nnz`, `num_pos`
    and the document frequency of every word id. Afterwards every id present in
    `token2id` is guaranteed to have an entry in `dfs` (zero if the id never occurred).

    Examples
    --------
    >>> from gensim.corpora import Dictionary
    >>>
    >>> corpus = [[(1, 1.0)], [], [(0, 5.0), (2, 1.0)], []]
    >>> dct = Dictionary.from_corpus(corpus)
    >>> len(dct)
    3

    """
    result = Dictionary()
    highest_id = -1

    # single pass over the corpus: collect statistics and document frequencies
    for docno, document in enumerate(corpus):
        if docno % 10000 == 0:
            logger.info("adding document #%i to %s", docno, result)
        result.num_docs += 1
        result.num_nnz += len(document)
        for wordid, word_freq in document:
            highest_id = max(wordid, highest_id)
            result.num_pos += word_freq
            seen_in_docs = result.dfs.get(wordid, 0)
            result.dfs[wordid] = seen_in_docs + 1

    if id2word is not None:
        # id=>word mapping given: simply copy it
        token2id = {}
        for idx, token in iteritems(id2word):
            token2id[utils.to_unicode(token)] = idx
    else:
        # make sure length(result) == get_max_id(corpus) + 1
        token2id = dict((unicode(i), i) for i in xrange(highest_id + 1))
    result.token2id = token2id

    # make sure all token ids have a valid `dfs` entry
    for idx in itervalues(result.token2id):
        result.dfs[idx] = result.dfs.get(idx, 0)

    logger.info(
        "built %s from %i documents (total %i corpus positions)",
        result, result.num_docs, result.num_pos
    )
    return result