def create_mat(self, list_, row2word=None, col2word=None):
    """
    Build an IndexMatrix from raw matrix data.

    Row and column labels default to this instance's own ``row2word`` /
    ``col2word`` when not supplied.
    """
    rows = self.row2word if row2word is None else row2word
    cols = self.col2word if col2word is None else col2word
    return IndexMatrix(sp.coo_matrix(list_), rows, cols)
def create_unknown(self):
    """
    Build a normalized index vector for out-of-vocabulary words.

    The vector is seeded deterministically from the md5 digest of the
    token ``$UNKNOWN$``, so it is stable across runs and (with high
    probability) near-orthogonal to the other index vectors.
    """
    digest = hashlib.md5()
    digest.update("$UNKNOWN$".encode())
    # np.random.seed only accepts values up to 2**32 - 1; fold the digest down.
    np.random.seed(int(digest.hexdigest(), 16) % 4294967295)

    dim = self.config['dimensionality']
    indices = np.random.permutation(dim)[:self.config['num_indices']]
    half = indices.size // 2

    vec = np.zeros((1, dim))
    vec[0, indices[half:]] = 1    # second half of the permutation: +1
    vec[0, indices[:half]] = -1   # first half: -1

    unknown = IndexMatrix(vec, ['$UNKNOWN$'], list(range(dim)))
    return unknown / unknown.norm()
def setUp(self):
    """Create the small sparse fixture matrix shared by the tests."""
    self.row2word = ['a', 'b']
    self.col2word = ['furiously', 'makes', 'sense']
    self.spmat = sp.coo_matrix([[2, 5, 3], [0, 1, 9]])
    self.mat = IndexMatrix(self.spmat, self.row2word, self.col2word)
def __init__(self, config=None):
    """Initialize the model with an empty language space.

    ``super().__init__`` must run first: it stores ``config``, which
    ``create_unknown`` reads.
    """
    super().__init__(config=config)
    # Empty language space; presumably filled later by training — TODO confirm.
    self.langvectors = IndexMatrix({})
    self.unknown_vec = self.create_unknown()
class Eigenvectors(RILangID):
    """
    Language identification via approximate first eigenvectors.

    A random-indexing matrix is built per language and collapsed into a
    single vector by summing its rows. By the power-iteration argument
    (http://en.wikipedia.org/wiki/Power_iteration) this sum approximates
    the matrix's first eigenvector, so the language space holds one such
    eigenvector per language.

    To identify a sentence, each of its known words is compared against a
    language's eigenvector; the language accumulating the highest total
    similarity wins, on the assumption that a sentence close to a
    language's eigenvector is similar to that language.

    This doesn't really seem to work very well.
    """

    def __init__(self, config=None):
        """Initialize an empty language space and the $UNKNOWN$ vector."""
        super().__init__(config=config)
        self.langvectors = IndexMatrix({})
        self.unknown_vec = self.create_unknown()

    def create_unknown(self):
        """
        Build a normalized index vector for out-of-vocabulary words,
        seeded deterministically from the md5 digest of ``$UNKNOWN$`` so
        it is stable across runs and near-orthogonal to other vectors.
        """
        digest = hashlib.md5()
        digest.update("$UNKNOWN$".encode())
        # np.random.seed only accepts values up to 2**32 - 1; fold down.
        np.random.seed(int(digest.hexdigest(), 16) % 4294967295)

        dim = self.config['dimensionality']
        indices = np.random.permutation(dim)[:self.config['num_indices']]
        half = indices.size // 2

        vec = np.zeros((1, dim))
        vec[0, indices[half:]] = 1    # second half of permutation: +1
        vec[0, indices[:half]] = -1   # first half: -1

        unknown = IndexMatrix(vec, ['$UNKNOWN$'], list(range(dim)))
        return unknown / unknown.norm()

    def identify(self, sentence):
        """
        Return the language whose eigenvector best matches *sentence*,
        or ``None`` when no language scores above zero.

        NOTE(review): words absent from a language's matrix are simply
        skipped here; despite the class docstring, ``self.unknown_vec``
        is not consulted — confirm intent.
        """
        tokens = sentence.split(" ")
        consistency = self.config.get('assure_consistency', False)
        winner, top = None, 0
        for lang, space in self.matrix.items():
            # Sum of |cosine| between each known word vector and the
            # language's eigenvector.
            score = sum(
                abs(pydsm.similarity.cos(
                    space[tok],
                    self.langvectors[lang],
                    assure_consistency=consistency)[0, 0])
                for tok in tokens
                if tok in space.row2word)
            if score > top:
                winner, top = lang, score
        return winner

    def train(self, corpora):
        """
        Build one random-indexing space per language and collapse each
        into its approximate first eigenvector (row sum), merged into
        ``self.langvectors``.
        """
        self.matrix = {}
        for lang, corpus in corpora.items():
            print("Reading {}...".format(lang))
            space = self.build(corpus)
            self.matrix[lang] = space
            eigvec = space.sum(axis=0)
            eigvec.row2word = [lang]
            self.langvectors = self.langvectors.merge(eigvec)

    def build(self, text):
        """Create a random indexing space for one language's corpus."""
        return RandomIndexing(corpus=text, config=self.config).matrix