def shorttext_to_avgvec(shorttext, wvmodel):
    """ Convert the short text into an averaged embedded vector representation.

    Given a short sentence, it converts all the tokens into embedded vectors
    according to the given word-embedding model, sums them up, and normalizes
    the resulting vector. It returns the resulting vector that represents this
    short sentence.

    :param shorttext: a short sentence
    :param wvmodel: word-embedding model
    :return: an embedded vector that represents the short sentence
    :type shorttext: str
    :type wvmodel: gensim.models.keyedvectors.KeyedVectors
    :rtype: numpy.ndarray
    """
    vec = np.sum([wvmodel[token] for token in tokenize(shorttext) if token in wvmodel],
                 axis=0)

    # normalize
    norm = np.linalg.norm(vec)
    if norm != 0:
        vec /= norm

    return vec
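# A minimal usage sketch for shorttext_to_avgvec, assuming a pre-trained
# word2vec model is available; the model path below is hypothetical, and
# any gensim KeyedVectors instance works here.
from gensim.models import KeyedVectors

wvmodel = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz',
                                            binary=True)
avgvec = shorttext_to_avgvec('machine learning is fun', wvmodel)
print(avgvec.shape)   # (300,) for a 300-dimensional model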
def shorttext_to_avgembedvec(shorttext, wvmodel, vecsize):
    """ Convert the short text into an averaged embedded vector representation.
    (deprecated, kept for backward compatibility)

    Given a short sentence, it converts all the tokens into embedded vectors
    according to the given word-embedding model, sums them up, and normalizes
    the resulting vector. It returns the resulting vector that represents this
    short sentence.

    This function has been deprecated. Please use :func:`shorttext_to_avgvec` instead.

    :param shorttext: a short sentence
    :param wvmodel: word-embedding model
    :param vecsize: length of embedded vector
    :return: an embedded vector that represents the short sentence
    :type shorttext: str
    :type wvmodel: gensim.models.keyedvectors.KeyedVectors
    :type vecsize: int
    :rtype: numpy.ndarray
    """
    vec = np.zeros(vecsize)
    for token in tokenize(shorttext):
        if token in wvmodel:
            vec += wvmodel[token]

    norm = np.linalg.norm(vec)
    if norm != 0:
        vec /= norm   # reuse the computed norm instead of recomputing it

    return vec
def convert_trainingdata_matrix(self, classdict):
    """ Convert the training data into the format expected by the neural networks.

    This is called by :func:`~train`.

    :param classdict: training data
    :return: a tuple of three, containing a list of class labels, a matrix of embedded word vectors, and the corresponding one-hot outputs
    :type classdict: dict
    :rtype: (list, numpy.ndarray, numpy.ndarray)
    """
    classlabels = list(classdict.keys())
    lblidx_dict = dict(zip(classlabels, range(len(classlabels))))

    # tokenize the words, and build the one-hot label vectors
    phrases = []
    indices = []
    for label in classlabels:
        for shorttext in classdict[label]:
            shorttext = shorttext if isinstance(shorttext, str) else ''
            category_bucket = [0] * len(classlabels)
            category_bucket[lblidx_dict[label]] = 1
            indices.append(category_bucket)
            phrases.append(tokenize(shorttext))

    # store embedded vectors
    train_embedvec = np.zeros(shape=(len(phrases), self.maxlen, self.vecsize))
    for i in range(len(phrases)):
        for j in range(min(self.maxlen, len(phrases[i]))):
            train_embedvec[i, j] = self.word_to_embedvec(phrases[i][j])
    indices = np.array(indices, dtype=int)   # np.int is removed in recent NumPy

    return classlabels, train_embedvec, indices
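# The one-hot label buckets built in convert_trainingdata_matrix can be
# illustrated standalone; the class labels below are made up for illustration.
import numpy as np

classlabels = ['politics', 'sports']
lblidx_dict = dict(zip(classlabels, range(len(classlabels))))

# bucket for a 'sports' document, mirroring the loop above
category_bucket = [0] * len(classlabels)
category_bucket[lblidx_dict['sports']] = 1
print(np.array([category_bucket], dtype=int))   # [[0 1]]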
def test_inaugural(self):
    # preparing data
    usprez = shorttext.data.inaugural()
    docids = sorted(usprez.keys())
    usprez = [' '.join(usprez[docid]) for docid in docids]
    usprezdf = pd.DataFrame({'yrprez': docids, 'speech': usprez})
    usprezdf = usprezdf[['yrprez', 'speech']]

    # preprocessor defined
    pipeline = [lambda s: re.sub(r'[^\w\s]', '', s),
                lambda s: re.sub(r'[\d]', '', s),
                lambda s: s.lower(),
                lambda s: ' '.join([stemword(token) for token in tokenize(s)])
                ]
    txtpreprocessor = shorttext.utils.text_preprocessor(pipeline)

    # corpus making
    docids = list(usprezdf['yrprez'])
    corpus = [txtpreprocessor(speech).split(' ') for speech in usprezdf['speech']]

    # making DTM
    dtm = shorttext.utils.DocumentTermMatrix(corpus, docids=docids, tfidf=True)

    # check results
    self.assertEqual(len(dtm.dictionary), 5406)
    self.assertAlmostEqual(dtm.get_token_occurences(stemword('change'))['2009-Obama'],
                           0.013801565936022027, places=4)
    numdocs, numtokens = dtm.dtm.shape
    self.assertEqual(numdocs, 56)
    self.assertEqual(numtokens, 5406)
    self.assertAlmostEqual(dtm.get_total_termfreq('government'),
                           0.27584786568258396, places=4)
def jaccardscore_sents(sent1, sent2, wvmodel, sim_words=lambda vec1, vec2: 1 - cosine(vec1, vec2)):
    """ Compute the Jaccard score between sentences based on their word similarities.

    :param sent1: first sentence
    :param sent2: second sentence
    :param wvmodel: word-embedding model
    :param sim_words: function for calculating the similarities between a pair of word vectors (default: cosine)
    :return: soft Jaccard score
    :type sent1: str
    :type sent2: str
    :type wvmodel: gensim.models.keyedvectors.KeyedVectors
    :type sim_words: function
    :rtype: float
    """
    tokens1 = tokenize(sent1)
    tokens2 = tokenize(sent2)
    tokens1 = list(filter(lambda w: w in wvmodel, tokens1))
    tokens2 = list(filter(lambda w: w in wvmodel, tokens2))
    allowable1 = [True] * len(tokens1)
    allowable2 = [True] * len(tokens2)

    simdict = {(i, j): sim_words(wvmodel[tokens1[i]], wvmodel[tokens2[j]])
               for i, j in product(range(len(tokens1)), range(len(tokens2)))}

    # greedily match the most similar remaining token pairs; each matched
    # pair contributes its similarity to the soft intersection
    intersection = 0.0
    simdictitems = sorted(simdict.items(), key=lambda s: s[1], reverse=True)
    for idxtuple, sim in simdictitems:
        i, j = idxtuple
        if allowable1[i] and allowable2[j]:
            intersection += sim
            allowable1[i] = False
            allowable2[j] = False

    union = len(tokens1) + len(tokens2) - intersection

    if union > 0:
        return intersection / union
    elif intersection == 0:
        return 1.
    else:
        return np.inf
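# A minimal usage sketch of jaccardscore_sents, reusing the hypothetical
# wvmodel loaded earlier. Because identical tokens match themselves with
# similarity 1, identical sentences score exactly 1, and the score shrinks
# as the vocabularies of the two sentences diverge.
score = jaccardscore_sents('Mary loves dogs', 'Mary likes cats', wvmodel)
print(score)   # typically between 0 and 1; higher for near-paraphrases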
def shorttext_to_matrix(self, shorttext):
    """ Convert the short text into a matrix with word-embedding representation.

    Given a short sentence, it converts all the tokens into embedded vectors
    according to the given word-embedding model, and puts them into a matrix.
    If a word is not in the model, that row is filled with zeros.

    :param shorttext: a short sentence
    :return: a matrix of embedded vectors that represent all the tokens in the sentence
    :type shorttext: str
    :rtype: numpy.ndarray
    """
    tokens = tokenize(shorttext)
    matrix = np.zeros((self.maxlen, self.vecsize))
    for i in range(min(self.maxlen, len(tokens))):
        matrix[i] = self.word_to_embedvec(tokens[i])
    return matrix
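# A minimal usage sketch, assuming `clf` is an instance of the classifier
# class above (a hypothetical name) with maxlen and vecsize already set.
mat = clf.shorttext_to_matrix('the quick brown fox')
print(mat.shape)   # (clf.maxlen, clf.vecsize)
# tokens beyond maxlen are truncated; rows past the last token stay zero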
def shorttext_to_embedvec(self, shorttext):
    """ Convert the short text into an averaged embedded vector representation.

    Given a short sentence, it converts all the tokens into embedded vectors
    according to the given word-embedding model, sums them up, and normalizes
    the resulting vector. It returns the resulting vector that represents this
    short sentence.

    :param shorttext: a short sentence
    :return: an embedded vector that represents the short sentence
    :type shorttext: str
    :rtype: numpy.ndarray
    """
    vec = np.zeros(self.vecsize)
    for token in tokenize(shorttext):
        if token in self.wvmodel:
            vec += self.wvmodel[token]

    norm = np.linalg.norm(vec)
    if norm != 0:
        vec /= norm   # reuse the computed norm instead of recomputing it

    return vec
def train(self, classdict, nb_epochs=500, l2reg=0.01, bias_l2reg=0.01, optimizer='adam'):
    """ Train the classifier.

    Given the training data, train the classifier.

    :param classdict: training data
    :param nb_epochs: number of epochs (Default: 500)
    :param l2reg: L2 regularization coefficient (Default: 0.01)
    :param bias_l2reg: L2 regularization coefficient for bias (Default: 0.01)
    :param optimizer: optimizer for gradient descent. Options: sgd, rmsprop, adagrad, adadelta, adam, adamax, nadam. (Default: adam)
    :return: None
    :type classdict: dict
    :type nb_epochs: int
    :type l2reg: float
    :type bias_l2reg: float
    :type optimizer: str
    """
    self.dictionary, self.corpus, self.classlabels = gc.generate_gensim_corpora(
        classdict,
        preprocess_and_tokenize=lambda s: tokenize(self.preprocessor(s)))
    self.index_classlabels()

    X, y = self.convert_classdict_to_XY(classdict)

    kmodel = logistic_framework(len(self.dictionary),
                                len(self.classlabels),
                                l2reg=l2reg,
                                bias_l2reg=bias_l2reg,
                                optimizer=optimizer)
    kmodel.fit(X.toarray(), y.toarray(), epochs=nb_epochs)

    self.model = kmodel
    self.trained = True
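# A minimal training sketch, assuming `classifier` is an instance of the
# class above and that the training dictionary follows the shorttext
# convention of {label: [short texts]}; the labels and texts are made up.
trainclassdict = {
    'greeting': ['hello there', 'good morning', 'hi'],
    'farewell': ['goodbye now', 'see you later', 'bye'],
}
classifier.train(trainclassdict, nb_epochs=100, optimizer='adam')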
def shorttext_to_vec(self, shorttext):
    """ Convert the short text into a sparse vector given the dictionary.

    According to the dictionary (gensim.corpora.Dictionary), convert the given
    text into a vector representation, according to the occurrence of tokens.

    This function is deprecated for training because it is too slow to call
    in a loop, but it is still used while doing prediction.

    :param shorttext: short text to be converted.
    :return: sparse vector of the vector representation
    :type shorttext: str
    :rtype: scipy.sparse.dok_matrix
    """
    # too slow for training loops; see convert_classdict_to_XY instead
    tokens = tokenize(self.preprocessor(shorttext))

    vec = dok_matrix((1, len(self.dictionary)))
    for token in tokens:
        if token in self.dictionary.token2id:
            vec[0, self.dictionary.token2id[token]] = 1.0

    return vec[0, :]
def convert_classdict_to_XY(self, classdict):
    """ Convert the training data into sparse matrices for training.

    :param classdict: training data
    :return: a tuple, consisting of sparse matrices for X (training data) and y (the labels of the training data)
    :type classdict: dict
    :rtype: tuple
    """
    nb_data = sum([len(classdict[k]) for k in classdict])
    X = dok_matrix((nb_data, len(self.dictionary)))
    y = dok_matrix((nb_data, len(self.labels2idx)))

    rowid = 0
    for label in classdict:
        if label in self.labels2idx:
            for shorttext in classdict[label]:
                tokens = tokenize(self.preprocessor(shorttext))
                for token in tokens:
                    # guard against out-of-dictionary tokens
                    if token in self.dictionary.token2id:
                        X[rowid, self.dictionary.token2id[token]] += 1.0
                y[rowid, self.labels2idx[label]] = 1.
                rowid += 1

    return X, y
import numpy as np
from gensim.corpora import Dictionary
from sklearn.preprocessing import OneHotEncoder
from keras.models import Sequential
from keras.layers import LSTM, Activation, Dropout, Dense, TimeDistributed

from . import SpellCorrector
from .binarize import default_alph, default_specialsignals
from .binarize import SpellingToConcatCharVecEncoder, SCRNNBinarizer
from shorttext.utils import classification_exceptions as ce
from shorttext.utils import tokenize

# strip surrounding whitespace from tokens and drop whitespace-only tokens
nospace_tokenize = lambda sentence: [t.strip() for t in tokenize(sentence)
                                     if len(t.strip()) > 0]


class SCRNNSpellCorrector(SpellCorrector):
    def __init__(self, operation,
                 alph=default_alph,
                 specialsignals=default_specialsignals,
                 concatcharvec_encoder=None,
                 batchsize=1,
                 nb_hiddenunits=650):
        self.operation = operation
        self.binarizer = SCRNNBinarizer(alph, specialsignals)
        self.concatcharvec_encoder = SpellingToConcatCharVecEncoder(alph) \
            if concatcharvec_encoder is None else concatcharvec_encoder
        self.onehotencoder = OneHotEncoder()
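# A quick illustration of nospace_tokenize, assuming shorttext.utils.tokenize
# performs ordinary word tokenization:
print(nospace_tokenize('hello   world '))   # expected: ['hello', 'world']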
import argparse


def argument_parser():
    parser = argparse.ArgumentParser(description='Converting SQLite Bible to Gensim Corpus')
    parser.add_argument('sqlite_bible_path', help='path of SQLite bible')
    parser.add_argument('target_path_prefix', help='prefix of gensim corpus and dictionary')
    parser.add_argument('--book', action='store_true', default=False,
                        help='books (not chapters) as documents')
    return parser


if __name__ == '__main__':
    parser = argument_parser()
    args = parser.parse_args()

    print('Read the database')
    sqlite_bible = bibledocs.get_sqlite3_dbconn(args.sqlite_bible_path)
    doc_iterator = bibledocs.retrieve_docs_as_biblebooks(sqlite_bible) if args.book \
        else bibledocs.retrieve_docs_as_biblechapters(sqlite_bible)

    print('Build the corpus')
    doc_label, (dictionary, gensim_corpus) = cpbuilder.build_corpus(
        doc_iterator,
        preprocess=lambda s: tokenize(standard_text_preprocessor_1(s)))

    print('Save the corpus')
    io.save_corpus(dictionary, gensim_corpus, args.target_path_prefix)
    io.save_doclabel(doc_label, args.target_path_prefix + '_doclabels.txt')