def jaccardscore_sents(sent1, sent2, wvmodel, sim_words=lambda vec1, vec2: 1-cosine(vec1, vec2)):
    tokens1 = tokenize(sent1)
    tokens2 = tokenize(sent2)
    tokens1 = list(filter(lambda w: w in wvmodel, tokens1))
    tokens2 = list(filter(lambda w: w in wvmodel, tokens2))
    allowable1 = [True] * len(tokens1)
    allowable2 = [True] * len(tokens2)

    simdict = {(i, j): sim_words(wvmodel[tokens1[i]], wvmodel[tokens2[j]])
               for i, j in product(range(len(tokens1)), range(len(tokens2)))}

    intersection = 0.0
    simdictitems = sorted(simdict.items(), key=lambda s: s[1], reverse=True)
    for idxtuple, sim in simdictitems:
        i, j = idxtuple
        if allowable1[i] and allowable2[j]:
            intersection += sim
            allowable1[i] = False
            allowable2[j] = False

    union = len(tokens1) + len(tokens2) - intersection

    if union > 0:
        return intersection / union
    elif intersection == 0:
        return 1.
    else:
        return np.inf
def shorttext_to_avgvec(shorttext, wvmodel):
    """ Convert the short text into an averaged embedded vector representation.

    Given a short sentence, it converts all the tokens into embedded vectors according to
    the given word-embedding model, sums them up, and normalizes the resulting vector.
    It returns the resulting vector that represents this short sentence.

    :param shorttext: a short sentence
    :param wvmodel: word-embedding model
    :return: an embedded vector that represents the short sentence
    :type shorttext: str
    :type wvmodel: gensim.models.keyedvectors.KeyedVectors
    :rtype: numpy.ndarray
    """
    vec = np.sum(
        [wvmodel[token] for token in tokenize(shorttext) if token in wvmodel],
        axis=0)

    # normalize
    norm = np.linalg.norm(vec)
    if norm != 0:
        vec /= norm

    return vec
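
# A quick usage sketch of shorttext_to_avgvec. Any mapping that supports
# `token in model` and `model[token]` can stand in for the word-embedding
# model; the toy dict below is purely illustrative, not a real KeyedVectors.
# Note that the function sums rather than arithmetically averages the token
# vectors; after L2 normalization the resulting direction is the same.
import numpy as np

toy_model = {'good': np.array([1.0, 0.0]),
             'morning': np.array([0.0, 1.0])}

vec = shorttext_to_avgvec('good morning', toy_model)
# sum = [1., 1.], L2 norm = sqrt(2), so vec is approximately [0.7071, 0.7071]
print(vec)
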
def shorttext_to_avgembedvec(shorttext, wvmodel, vecsize):
    """ Convert the short text into an averaged embedded vector representation. (deprecated, kept for backward compatibility)

    Given a short sentence, it converts all the tokens into embedded vectors according to
    the given word-embedding model, sums them up, and normalizes the resulting vector.
    It returns the resulting vector that represents this short sentence.

    This function has been deprecated. Please use :func:`shorttext_to_avgvec` instead.

    :param shorttext: a short sentence
    :param wvmodel: word-embedding model
    :param vecsize: length of embedded vector
    :return: an embedded vector that represents the short sentence
    :type shorttext: str
    :type wvmodel: gensim.models.keyedvectors.KeyedVectors
    :type vecsize: int
    :rtype: numpy.ndarray
    """
    vec = np.zeros(vecsize)
    for token in tokenize(shorttext):
        if token in wvmodel:
            vec += wvmodel[token]
    norm = np.linalg.norm(vec)
    if norm != 0:
        vec /= norm
    return vec
    def convert_trainingdata_matrix(self, classdict):
        """ Convert the training data into format put into the neural networks.

        Convert the training data into format put into the neural networks.
        This is called by :func:`~train`.

        :param classdict: training data
        :return: a tuple of three, containing a list of class labels, matrix of embedded word vectors, and corresponding outputs
        :type classdict: dict
        :rtype: (list, numpy.ndarray, list)
        """
        classlabels = list(classdict.keys())
        lblidx_dict = dict(zip(classlabels, range(len(classlabels))))

        # tokenize the words, and determine the word length
        phrases = []
        indices = []
        for label in classlabels:
            for shorttext in classdict[label]:
                shorttext = shorttext if isinstance(shorttext, str) else ''
                category_bucket = [0] * len(classlabels)
                category_bucket[lblidx_dict[label]] = 1
                indices.append(category_bucket)
                phrases.append(tokenize(shorttext))

        # store embedded vectors
        train_embedvec = np.zeros(shape=(len(phrases), self.maxlen,
                                         self.vecsize))
        for i in range(len(phrases)):
            for j in range(min(self.maxlen, len(phrases[i]))):
                train_embedvec[i, j] = self.word_to_embedvec(phrases[i][j])
        indices = np.array(indices, dtype=int)

        return classlabels, train_embedvec, indices
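
# The `classdict` training data maps each class label to a list of short
# texts. A standalone sketch of the one-hot label encoding performed above,
# with made-up labels and texts (illustrative only):
import numpy as np

classdict = {'greeting': ['hello there', 'good morning'],
             'farewell': ['goodbye', 'see you later']}

classlabels = list(classdict.keys())
lblidx_dict = dict(zip(classlabels, range(len(classlabels))))

indices = []
for label in classlabels:
    for _ in classdict[label]:
        category_bucket = [0] * len(classlabels)
        category_bucket[lblidx_dict[label]] = 1
        indices.append(category_bucket)

print(np.array(indices, dtype=int))   # shape (4, 2): one one-hot row per text
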
    def test_inaugural(self):
        # preparing data
        usprez = shorttext.data.inaugural()
        docids = sorted(usprez.keys())
        usprez = [' '.join(usprez[docid]) for docid in docids]
        usprezdf = pd.DataFrame({'yrprez': docids, 'speech': usprez})
        usprezdf = usprezdf[['yrprez', 'speech']]

        # preprocessor defined
        pipeline = [lambda s: re.sub(r'[^\w\s]', '', s),
                    lambda s: re.sub(r'[\d]', '', s),
                    lambda s: s.lower(),
                    lambda s: ' '.join([stemword(token) for token in tokenize(s)])
                    ]
        txtpreprocessor = shorttext.utils.text_preprocessor(pipeline)

        # corpus making
        docids = list(usprezdf['yrprez'])
        corpus = [txtpreprocessor(speech).split(' ') for speech in usprezdf['speech']]

        # making DTM
        dtm = shorttext.utils.DocumentTermMatrix(corpus, docids=docids, tfidf=True)

        # check results
        self.assertEqual(len(dtm.dictionary), 5406)
        self.assertAlmostEqual(dtm.get_token_occurences(stemword('change'))['2009-Obama'], 0.013801565936022027,
                               places=4)
        numdocs, numtokens = dtm.dtm.shape
        self.assertEqual(numdocs, 56)
        self.assertEqual(numtokens, 5406)
        self.assertAlmostEqual(dtm.get_total_termfreq('government'), 0.27584786568258396,
                               places=4)
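
# shorttext.utils.text_preprocessor presumably chains the pipeline steps in
# order; a minimal standalone sketch of that composition (function names here
# are made up for illustration):
import re

pipeline = [lambda s: re.sub(r'[^\w\s]', '', s),
            lambda s: s.lower()]

def compose_pipeline(steps):
    def preprocess(text):
        for step in steps:
            text = step(text)
        return text
    return preprocess

txtpreprocessor = compose_pipeline(pipeline)
print(txtpreprocessor('Hello, World!'))   # 'hello world'
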
def jaccardscore_sents(sent1,
                       sent2,
                       wvmodel,
                       sim_words=lambda vec1, vec2: 1 - cosine(vec1, vec2)):
    """ Compute the Jaccard score between sentences based on their word similarities.

    :param sent1: first sentence
    :param sent2: second sentence
    :param wvmodel: word-embedding model
    :param sim_words: function for calculating the similarities between a pair of word vectors (default: cosine)
    :return: soft Jaccard score
    :type sent1: str
    :type sent2: str
    :type wvmodel: gensim.models.keyedvectors.KeyedVectors
    :type sim_words: function
    :rtype: float
    """
    tokens1 = tokenize(sent1)
    tokens2 = tokenize(sent2)
    tokens1 = list(filter(lambda w: w in wvmodel, tokens1))
    tokens2 = list(filter(lambda w: w in wvmodel, tokens2))
    allowable1 = [True] * len(tokens1)
    allowable2 = [True] * len(tokens2)

    simdict = {(i, j): sim_words(wvmodel[tokens1[i]], wvmodel[tokens2[j]])
               for i, j in product(range(len(tokens1)), range(len(tokens2)))}

    intersection = 0.0
    simdictitems = sorted(simdict.items(), key=lambda s: s[1], reverse=True)
    for idxtuple, sim in simdictitems:
        i, j = idxtuple
        if allowable1[i] and allowable2[j]:
            intersection += sim
            allowable1[i] = False
            allowable2[j] = False

    union = len(tokens1) + len(tokens2) - intersection

    if union > 0:
        return intersection / union
    elif intersection == 0:
        return 1.
    else:
        return np.inf
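
# Worked toy example for jaccardscore_sents, assuming this snippet's own
# imports (numpy as np, scipy.spatial.distance.cosine, itertools.product,
# shorttext's tokenize) are in scope. A plain dict of unit vectors stands in
# for the embedding model.
toy_model = {'cat': np.array([1.0, 0.0]),
             'dog': np.array([0.8, 0.6]),
             'car': np.array([0.0, 1.0])}

score = jaccardscore_sents('cat dog', 'dog car', toy_model)
# Greedy matching takes the most similar pair first: sim(dog, dog) = 1.0,
# which blocks the remaining dog pairings, leaving sim(cat, car) = 0.0.
# intersection = 1.0 and union = 2 + 2 - 1.0 = 3.0, so the score is ~0.3333.
print(score)
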
    def shorttext_to_matrix(self, shorttext):
        """ Convert the short text into a matrix with word-embedding representation.

        Given a short sentence, it converts all the tokens into embedded vectors according to
        the given word-embedding model, and puts them into a matrix. If a word is not in the model,
        that row will be filled with zeros.

        :param shorttext: a short sentence
        :return: a matrix of embedded vectors that represent all the tokens in the sentence
        :type shorttext: str
        :rtype: numpy.ndarray
        """
        tokens = tokenize(shorttext)
        matrix = np.zeros((self.maxlen, self.vecsize))
        for i in range(min(self.maxlen, len(tokens))):
            matrix[i] = self.word_to_embedvec(tokens[i])
        return matrix
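
# Standalone sketch of the zero-padded matrix layout this method produces,
# with made-up maxlen/vecsize values and a toy embedding lookup:
import numpy as np

maxlen, vecsize = 4, 2
toy_model = {'hi': np.array([1.0, 0.0])}

tokens = ['hi', 'there']
matrix = np.zeros((maxlen, vecsize))
for i in range(min(maxlen, len(tokens))):
    if tokens[i] in toy_model:          # out-of-vocabulary rows stay zero
        matrix[i] = toy_model[tokens[i]]
print(matrix)   # row 0 = embedding of 'hi'; rows 1-3 remain zero
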
    def shorttext_to_embedvec(self, shorttext):
        """ Convert the short text into an averaged embedded vector representation.

        Given a short sentence, it converts all the tokens into embedded vectors according to
        the given word-embedding model, sums them up, and normalizes the resulting vector.
        It returns the resulting vector that represents this short sentence.

        :param shorttext: a short sentence
        :return: an embedded vector that represents the short sentence
        :type shorttext: str
        :rtype: numpy.ndarray
        """
        vec = np.zeros(self.vecsize)
        for token in tokenize(shorttext):
            if token in self.wvmodel:
                vec += self.wvmodel[token]
        norm = np.linalg.norm(vec)
        if norm != 0:
            vec /= norm
        return vec
    def train(self,
              classdict,
              nb_epochs=500,
              l2reg=0.01,
              bias_l2reg=0.01,
              optimizer='adam'):
        """ Train the classifier.

        Given the training data, train the classifier.

        :param classdict: training data
        :param nb_epochs: number of epochs (Default: 500)
        :param l2reg: L2 regularization coefficient (Default: 0.01)
        :param bias_l2reg: L2 regularization coefficient for bias (Default: 0.01)
        :param optimizer: optimizer for gradient descent. Options: sgd, rmsprop, adagrad, adadelta, adam, adamax, nadam. (Default: adam)
        :return: None
        :type classdict: dict
        :type nb_epochs: int
        :type l2reg: float
        :type bias_l2reg: float
        :type optimizer: str
        """
        self.dictionary, self.corpus, self.classlabels = gc.generate_gensim_corpora(
            classdict,
            preprocess_and_tokenize=lambda s: tokenize(self.preprocessor(s)))
        self.index_classlabels()

        X, y = self.convert_classdict_to_XY(classdict)

        kmodel = logistic_framework(len(self.dictionary),
                                    len(self.classlabels),
                                    l2reg=l2reg,
                                    bias_l2reg=bias_l2reg,
                                    optimizer=optimizer)
        kmodel.fit(X.toarray(), y.toarray(), epochs=nb_epochs)

        self.model = kmodel
        self.trained = True
    def shorttext_to_vec(self, shorttext):
        """ Convert the shorttext into a sparse vector given the dictionary.

        According to the dictionary (gensim.corpora.Dictionary), convert the given text
        into a vector representation, according to the occurrence of tokens.

        This function is deprecated for training because it is too slow to call in a loop,
        but it is still used during prediction.

        :param shorttext: short text to be converted.
        :return: sparse vector of the vector representation
        :type shorttext: str
        :rtype: scipy.sparse.dok_matrix
        """
        # too slow, deprecated
        tokens = tokenize(self.preprocessor(shorttext))

        vec = dok_matrix((1, len(self.dictionary)))
        for token in tokens:
            if token in self.dictionary.token2id:
                vec[0, self.dictionary.token2id[token]] = 1.0

        return vec[0, :]
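
# Minimal standalone sketch of the same occurrence encoding, with a toy
# token2id mapping standing in for a gensim Dictionary (names illustrative):
from scipy.sparse import dok_matrix

token2id = {'hello': 0, 'world': 1, 'bye': 2}
vec = dok_matrix((1, len(token2id)))
for token in ['hello', 'world', 'hello']:
    if token in token2id:
        vec[0, token2id[token]] = 1.0   # presence flag, not a count
print(vec.toarray())   # [[1. 1. 0.]]
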
    def convert_classdict_to_XY(self, classdict):
        """ Convert the training data into sparse matrices for training.

        :param classdict: training data
        :return: a tuple, consisting of sparse matrices for X (training data) and y (the labels of the training data)
        :type classdict: dict
        :rtype: tuple
        """
        nb_data = sum([len(classdict[k]) for k in classdict])
        X = dok_matrix((nb_data, len(self.dictionary)))
        y = dok_matrix((nb_data, len(self.labels2idx)))

        rowid = 0
        for label in classdict:
            if label in self.labels2idx:
                for shorttext in classdict[label]:
                    tokens = tokenize(self.preprocessor(shorttext))
                    for token in tokens:
                        X[rowid, self.dictionary.token2id[token]] += 1.0
                    y[rowid, self.labels2idx[label]] = 1.
                    rowid += 1

        return X, y
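
# Unlike the binary flags in shorttext_to_vec, the X matrix above accumulates
# raw term counts (note the `+= 1.0`). A standalone sketch with toy data:
from scipy.sparse import dok_matrix

token2id = {'hello': 0, 'world': 1, 'bye': 2}
texts = [['hello', 'world'], ['bye', 'bye', 'world']]

X = dok_matrix((len(texts), len(token2id)))
for rowid, tokens in enumerate(texts):
    for token in tokens:
        X[rowid, token2id[token]] += 1.0
print(X.toarray())
# [[1. 1. 0.]
#  [0. 1. 2.]]
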
import numpy as np
from gensim.corpora import Dictionary
from sklearn.preprocessing import OneHotEncoder
from keras.models import Sequential
from keras.layers import LSTM, Activation, Dropout, Dense, TimeDistributed

from . import SpellCorrector
from .binarize import default_alph, default_specialsignals
from shorttext.utils import classification_exceptions as ce
from shorttext.utils import tokenize
from .binarize import SpellingToConcatCharVecEncoder, SCRNNBinarizer

nospace_tokenize = lambda sentence: [t.strip() for t in tokenize(sentence)
                                     if len(t.strip()) > 0]


class SCRNNSpellCorrector(SpellCorrector):
    def __init__(self,
                 operation,
                 alph=default_alph,
                 specialsignals=default_specialsignals,
                 concatcharvec_encoder=None,
                 batchsize=1,
                 nb_hiddenunits=650):
        self.operation = operation
        self.binarizer = SCRNNBinarizer(alph, specialsignals)
        self.concatcharvec_encoder = SpellingToConcatCharVecEncoder(
            alph) if concatcharvec_encoder is None else concatcharvec_encoder
        self.onehotencoder = OneHotEncoder()

import argparse


def argument_parser():
    parser = argparse.ArgumentParser(
        description='Converting SQLite Bible to Gensim Corpus')
    parser.add_argument('sqlite_bible_path', help='path of SQLite bible')
    parser.add_argument('target_path_prefix',
                        help='prefix of gensim corpus and dictionary')
    parser.add_argument('--book',
                        action='store_true',
                        default=False,
                        help='books (not chapters) as documents')
    return parser


if __name__ == '__main__':
    parser = argument_parser()
    args = parser.parse_args()

    print('Read the database')
    sqlite_bible = bibledocs.get_sqlite3_dbconn(args.sqlite_bible_path)
    doc_iterator = bibledocs.retrieve_docs_as_biblebooks(
        sqlite_bible
    ) if args.book else bibledocs.retrieve_docs_as_biblechapters(sqlite_bible)
    print('Build the corpus')
    doc_label, (dictionary, gensim_corpus) = cpbuilder.build_corpus(
        doc_iterator,
        preprocess=lambda s: tokenize(standard_text_preprocessor_1(s)))
    print('Save the corpus')
    io.save_corpus(dictionary, gensim_corpus, args.target_path_prefix)
    io.save_doclabel(doc_label, args.target_path_prefix + '_doclabels.txt')
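
# Hypothetical invocation of the script above (file name and prefix made up):
#   python sqlitebible_to_gensim.py kjv.sqlite3 kjv_corpus --book
# The argument parser itself can be exercised directly:
parser = argument_parser()
args = parser.parse_args(['kjv.sqlite3', 'kjv_corpus', '--book'])
print(args.sqlite_bible_path, args.target_path_prefix, args.book)
# kjv.sqlite3 kjv_corpus True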