Example #1
def buildnnvectorspace(sentencetuples, searchobject):
    """

	find the words
	find the morphology objects you need for those words
	build vectors

	:return:
	"""

    wordbundler = False

    activepoll = searchobject.poll

    # find all words in use
    try:
        thesentences = [s[1] for s in sentencetuples]
    except TypeError:
        # no sentences were passed: a possibility in the new code that should get factored away eventually
        return 'failed to build model'

    allwords = findsetofallwords(thesentences)

    # find all possible forms of all the words we used
    # consider subtracting some set like: rarewordsthatpretendtobecommon = {}
    wl = '{:,}'.format(len(thesentences))
    activepoll.statusis(
        'No stored model for this search. Generating a new one.<br />Finding headwords for {n} sentences'
        .format(n=wl))

    morphdict = getrequiredmorphobjects(allwords, furtherdeabbreviate=True)

    # associate each word with its possible headwords
    morphdict = convertmophdicttodict(morphdict)

    # import re
    # teststring = r'aesa'
    # kvpairs = [(k,morphdict[k]) for k in morphdict.keys() if re.search(teststring, k)]
    # print('convertmophdicttodict', kvpairs)
    # sample selection:
    # convertmophdicttodict [('caesare', {'caesar'}), ('caesarisque', {'caesar'}), ('caesari', {'caesar'}), ('caesar', {'caesar'}), ('caesa', {'caesum', 'caesa', 'caesus¹', 'caedo'}), ('caesaremque', {'caesar'}), ('caesaris', {'caesar'}), ('caesarem', {'caesar'})]

    if wordbundler:
        morphdict = {t: '·'.join(morphdict[t]) for t in morphdict}

    activepoll.statusis(
        'No stored model for this search. Generating a new one.<br />Building vectors for the headwords in the {n} sentences'
        .format(n=wl))
    vectorspace = buildgensimmodel(searchobject, morphdict, thesentences)

    return vectorspace
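
# A minimal, self-contained sketch (not HipparchiaServer code; the tiny morphdict is invented)
# of the data flow that buildnnvectorspace() depends on: convertmophdicttodict() yields a
# mapping from observed forms to sets of candidate headwords (see the sample selection in the
# comments above), and the sentences are then rewritten in terms of those headwords before
# the gensim model is built.
def lemmatizesentences(sentences: list, morphdict: dict) -> list:
    """rewrite each sentence as a list of headwords, silently dropping unparsed forms"""
    bags = list()
    for s in sentences:
        bag = list()
        for word in s.split():
            bag.extend(sorted(morphdict.get(word, set())))
        bags.append(bag)
    return bags

demomorphdict = {'caesari': {'caesar'}, 'caesa': {'caedo', 'caesum'}}
print(lemmatizesentences(['caesari caesa'], demomorphdict))
# [['caesar', 'caedo', 'caesum']]
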
def gensimexperiment(so):
	"""

	:param so:
	:return:
	"""

	activepoll = so.poll

	activecorpora = so.getactivecorpora()
	searchlist = compilesearchlist(listmapper, so.session)
	searchlist = flagexclusions(searchlist, so.session)
	workssearched = len(searchlist)
	searchlist = calculatewholeauthorsearches(searchlist, authordict)
	so.searchlist = searchlist
	sentencetuples = vectorprepdispatcher(so)
	# find all words in use
	listsofwords = [s[1] for s in sentencetuples]
	allwords = findsetofallwords(listsofwords)

	# find all possible forms of all the words we used
	# consider subtracting some set like: rarewordsthatpretendtobecommon = {}
	wl = '{:,}'.format(len(listsofwords))
	activepoll.statusis('Finding headwords for {n} sentences'.format(n=wl))

	morphdict = getrequiredmorphobjects(allwords)
	morphdict = convertmophdicttodict(morphdict)

	# find all possible headwords of all of the forms in use
	# note that we will not know what we did not know: count unparsed words too and deliver that as info at the end?
	allheadwords = dict()
	for m in morphdict.keys():
		for h in morphdict[m]:
			allheadwords[h] = m

	vectorspace = logentropybuildspace(so, morphdict, listsofwords)

	return vectorspace
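
# A small pure-Python illustration (invented data, not project code) of the inversion loop in
# gensimexperiment() above: morphdict maps an observed form to its possible headwords, and the
# loop flips that into a dict keyed by headword. Note that, as written, each headword retains
# only the last form that pointed at it; the values serve only as membership markers here.
demomorphdict = {'caesari': {'caesar'}, 'caesarem': {'caesar'}, 'caesa': {'caedo', 'caesum'}}
demoallheadwords = dict()
for form in demomorphdict:
    for headword in demomorphdict[form]:
        demoallheadwords[headword] = form
print(sorted(demoallheadwords))
# ['caedo', 'caesar', 'caesum']
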
Example #3
def buildlemmatizesearchphrase(phrase: str) -> str:
    """

	turn a search into a collection of headwords

	:param phrase:
	:return:
	"""

    # phrase = 'vias urbis munera'
    phrase = phrase.strip()
    words = phrase.split(' ')

    morphdict = getrequiredmorphobjects(words, furtherdeabbreviate=True)
    morphdict = convertmophdicttodict(morphdict)
    # morphdict {'munera': {'munero', 'munus'}, 'urbis': {'urbs'}, 'uias': {'via', 'vio'}}

    listoflistofheadwords = buildflatbagsofwords(morphdict, [words])
    # [['via', 'vio', 'urbs', 'munero', 'munus']]

    lemmatizesearchphrase = ' '.join(listoflistofheadwords[0])
    # lemmatizesearchphrase = 'via vio urbs munus munero'

    return lemmatizesearchphrase
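
# A simplified standalone stand-in (not the project's buildflatbagsofwords()) that reproduces
# the worked example in the comments of buildlemmatizesearchphrase(): every candidate headword
# of every word in the phrase is flattened into a single space-joined search string.
def flattenheadwords(words: list, morphdict: dict) -> str:
    flattened = list()
    for w in words:
        # fall back to the original form if the word could not be parsed
        flattened.extend(sorted(morphdict.get(w, {w})))
    return ' '.join(flattened)

demomorphdict = {'munera': {'munero', 'munus'}, 'urbis': {'urbs'}, 'uias': {'via', 'vio'}}
print(flattenheadwords(['uias', 'urbis', 'munera'], demomorphdict))
# via vio urbs munero munus
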
def ldatopicgraphing(sentencetuples,
                     workssearched,
                     searchobject,
                     headwordstops=True):
    """

	a sentence tuple looks like:
		('gr2397w001_ln_42', 'ποίῳ δὴ τούτων ἄξιον τὸν κόϲμον φθείρεϲθαι φάναι')

	see:
		http://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html#sphx-glr-auto-examples-applications-plot-topics-extraction-with-nmf-lda-py

	see also:

		https://nlpforhackers.io/topic-modeling/

	CountVectorizer:
	max_df : float in range [0.0, 1.0] or int, default=1.0
	    When building the vocabulary ignore terms that have a document frequency strictly higher than the given threshold (corpus-specific stop words).

	min_df : float in range [0.0, 1.0] or int, default=1
	    When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold. This value is also called cut-off in the literature.


	see:
		https://stackoverflow.com/questions/27697766/understanding-min-df-and-max-df-in-scikit-countvectorizer#35615151

	max_df is used for removing terms that appear too frequently, also known as "corpus-specific stop words". For example:

		max_df = 0.50 means "ignore terms that appear in more than 50% of the documents".
		max_df = 25 means "ignore terms that appear in more than 25 documents".

	The default max_df is 1.0, which means "ignore terms that appear in more than 100% of the documents". Thus, the default setting does not ignore any terms.

	min_df is used for removing terms that appear too infrequently. For example:

		min_df = 0.01 means "ignore terms that appear in less than 1% of the documents".
		min_df = 5 means "ignore terms that appear in less than 5 documents".

	The default min_df is 1, which means "ignore terms that appear in less than 1 document". Thus, the default setting does not ignore any terms.

	notes:
		maxfreq of 1 will give you a lot of excessively common words: 'this', 'that', etc.
		maxfreq of

	on the general issue of graphing see also:
		https://speakerdeck.com/bmabey/visualizing-topic-models
		https://de.dariah.eu/tatom/topic_model_visualization.html

	on the axes:
		https://stats.stackexchange.com/questions/222/what-are-principal-component-scores

	:param sentencetuples:
	:param workssearched:
	:param searchobject:
	:param headwordstops:
	:return:
	"""

    if headwordstops:
        stops = mostcommonwordsviaheadwords()
    else:
        stops = mostcommoninflectedforms()

    sentencetuples = [(a, removestopwords(b, stops))
                      for a, b in sentencetuples]

    activepoll = searchobject.poll
    vv = searchobject.vectorvalues

    settings = {
        'maxfeatures': vv.ldamaxfeatures,
        'components': vv.ldacomponents,  # topics
        'maxfreq': vv.ldamaxfreq,  # fewer than n% of sentences should have this word (i.e., purge common words)
        'minfreq': vv.ldaminfreq,  # word must be found >n times
        'iterations': vv.ldaiterations,
        'mustbelongerthan': vv.ldamustbelongerthan
    }

    # not easy to store/fetch since you need both ldavectorizer and ldamodel
    # so we just store the actual graph...
    ldavishtmlandjs = checkforstoredvector(searchobject, 'lda')

    if not ldavishtmlandjs:
        sentencetuples = [
            s for s in sentencetuples
            if len(s[1].strip().split(' ')) > settings['mustbelongerthan']
        ]
        sentences = [s[1] for s in sentencetuples]

        sentencesaslists = [s.split(' ') for s in sentences]
        allwordsinorder = [
            item for sublist in sentencesaslists for item in sublist if item
        ]

        activepoll.statusis('Finding all headwords')
        morphdict = getrequiredmorphobjects(set(allwordsinorder),
                                            furtherdeabbreviate=True)
        morphdict = convertmophdicttodict(morphdict)

        bagsofwordlists = buildwordbags(searchobject, morphdict,
                                        sentencesaslists)
        bagsofsentences = [' '.join(b) for b in bagsofwordlists]

        # print('bagsofsentences[:3]', bagsofsentences[:3])

        activepoll.statusis('Running the LDA vectorizer')
        # Use tf (raw term count) features for LDA.
        ldavectorizer = CountVectorizer(max_df=settings['maxfreq'],
                                        min_df=settings['minfreq'],
                                        max_features=settings['maxfeatures'])

        ldavectorized = ldavectorizer.fit_transform(bagsofsentences)

        ldamodel = LatentDirichletAllocation(
            n_components=settings['components'],
            max_iter=settings['iterations'],
            learning_method='online',
            learning_offset=50.,
            random_state=0)

        ldamodel.fit(ldavectorized)

        visualisation = ldavis.prepare(ldamodel, ldavectorized, ldavectorizer)
        # pyLDAvis.save_html(visualisation, 'ldavis.html')

        ldavishtmlandjs = pyLDAvis.prepared_data_to_html(visualisation)
        storevectorindatabase(searchobject, 'lda', ldavishtmlandjs)

    jsonoutput = ldatopicsgenerateoutput(ldavishtmlandjs, searchobject)

    return jsonoutput
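
# A self-contained demonstration (independent of ldatopicgraphing(); requires scikit-learn) of
# the max_df / min_df behaviour that the docstring above describes: terms appearing in more
# than max_df of the documents, or in fewer than min_df documents, are dropped from the
# vocabulary before the LDA model ever sees them.
from sklearn.feature_extraction.text import CountVectorizer

demodocs = ['apple banana cherry',
            'apple banana',
            'apple cherry',
            'apple fig']

demovectorizer = CountVectorizer(max_df=0.75, min_df=2)
demovectorizer.fit(demodocs)
print(sorted(demovectorizer.vocabulary_))
# 'apple' appears in 4/4 documents (more than 75%) and is discarded as a corpus-specific stop
# word; 'fig' appears in only 1 document (fewer than min_df=2) and is discarded as too rare:
# ['banana', 'cherry']
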
def tfnlptraining(sentences, searchobject):
    """


	:param sentences:
	:param searchobject:
	:return:
	"""

    activepoll = searchobject.poll

    sentencesaslists = [s[1].split(' ') for s in sentences]
    allwordsinorder = [
        item for sublist in sentencesaslists for item in sublist if item
    ]

    setofallwords = set(allwordsinorder)
    morphdict = getrequiredmorphobjects(setofallwords)
    morphdict = convertmophdicttodict(morphdict)

    setofallheadwords = list()
    joined = True
    if not joined:
        # FLATTENED lemmata
        for w in setofallwords:
            # note that we are warping the shape of the sentences by doing this
            # an alternative is to '·'.join() the items
            try:
                setofallheadwords.extend(morphdict[w])
            except KeyError:
                pass
    else:
        # JOINED lemmata
        for w in setofallwords:
            # note that we are warping the shape of the sentences by doing this
            # an alternative is to '·'.join() the items
            try:
                setofallheadwords.append('·'.join(morphdict[w]))
            except KeyError:
                pass

    vocabularysize = min(10000, len(setofallheadwords))

    activepoll.statusis('Constructing dataset')
    dataset = builddatasetdict(setofallheadwords, vocabularysize)

    integermorphdict = dict()
    for wordform in morphdict:
        try:
            integermorphdict[wordform] = {
                dataset['wordsmappedtocodes'][w]
                for w in morphdict[wordform]
            }
        except KeyError:
            pass

    # integermorphdict looks like: {'lepidum': {33, 34}, 'habe': {32}, 'tum': {31}, ...}

    activepoll.statusis('Converting sentences to lists of integers')
    textasvals = converttexttoinexvalues(sentencesaslists, integermorphdict)
    textasvals = [t for t in textasvals if t]

    activepoll.statusis('Starting tensorflow work')
    similarities = tfnlpwork(textasvals, dataset['wordsmappedtocodes'],
                             dataset['codesmappedtowords'], activepoll)

    print('similarities', similarities)

    return
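
# A pure-Python illustration (invented data, not project code) of the two bagging strategies
# toggled by `joined` in tfnlptraining() above: FLATTENED lemmata add every candidate headword
# to the bag as a separate token, while JOINED lemmata keep the ambiguity inside one composite
# '·'-joined token.
demomorphdict = {'caesa': {'caedo', 'caesum'}, 'caesari': {'caesar'}}

flattenedlemmata = list()
for form in demomorphdict:
    flattenedlemmata.extend(sorted(demomorphdict[form]))
print(flattenedlemmata)
# ['caedo', 'caesum', 'caesar']

joinedlemmata = ['·'.join(sorted(demomorphdict[form])) for form in demomorphdict]
print(joinedlemmata)
# ['caedo·caesum', 'caesar']
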
def tftrainondata(sentences, searchobject):
    """

	adapted from the tensorflow tutorial

	sentences = ['the first sentence', 'the next sentence', ...]

	:param sentences:
	:param searchobject:
	:return:
	"""

    activepoll = searchobject.poll

    sentencesaslists = [s.split(' ') for s in sentences]
    allwordsinorder = [
        item for sublist in sentencesaslists for item in sublist if item
    ]

    morphdict = getrequiredmorphobjects(set(allwordsinorder))
    morphdict = convertmophdicttodict(morphdict)

    headwordsinorder = list()
    for w in allwordsinorder:
        try:
            hwds = [item for item in morphdict[w]]
            headwordsinorder.append('·'.join(hwds))
        except TypeError:
            pass
        except KeyError:
            pass

    vocabularysize = min(10000, len(set(headwordsinorder)))

    activepoll.statusis('Constructing dataset')
    dataset = builddatasetdict(headwordsinorder, vocabularysize)

    batchsize = 8
    skipwindow = 1
    numberofskips = 2

    # print('dataset', dataset)
    activepoll.statusis('Constructing training batch')
    trainingbatch = tfgeneratetrainingbatch(batchsize, numberofskips,
                                            skipwindow, dataset['listofcodes'],
                                            0)

    batchsize = 128
    embeddingsize = 128
    numsampled = 64

    # We pick a random validation set to sample nearest neighbors. Here we limit the
    # validation samples to the words that have a low numeric ID, which by
    # construction are also the most frequent. These 3 variables are used only for
    # displaying model accuracy, they don't affect calculation.

    validsize = 16
    validwindow = 100
    validexamples = np.random.choice(validwindow, validsize, replace=False)

    graph = tf.Graph()

    with graph.as_default():
        # Input data.
        traininputs = tf.placeholder(tf.int32, shape=[batchsize])
        trainlabels = tf.placeholder(tf.int32, shape=[batchsize, 1])
        validdataset = tf.constant(validexamples, dtype=tf.int32)

        # Ops and variables pinned to the CPU because of missing GPU implementation
        with tf.device('/cpu:0'):
            # Look up embeddings for inputs.
            embeddings = tf.Variable(
                tf.random_uniform([vocabularysize, embeddingsize], -1.0, 1.0))
            embed = tf.nn.embedding_lookup(embeddings, traininputs)

            # Construct the variables for the NCE loss
            nce_weights = tf.Variable(
                tf.truncated_normal([vocabularysize, embeddingsize],
                                    stddev=1.0 / math.sqrt(embeddingsize)))
            nce_biases = tf.Variable(tf.zeros([vocabularysize]))

        # Compute the average NCE loss for the batch.
        # tf.nce_loss automatically draws a new sample of the negative labels each
        # time we evaluate the loss.
        # Explanation of the meaning of NCE loss:
        #   http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/
        loss = tf.reduce_mean(
            tf.nn.nce_loss(weights=nce_weights,
                           biases=nce_biases,
                           labels=trainlabels,
                           inputs=embed,
                           num_sampled=numsampled,
                           num_classes=vocabularysize))

        # Construct the SGD optimizer using a learning rate of 1.0.
        optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

        # Compute the cosine similarity between minibatch examples and all embeddings.
        norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
        normalizedembeddings = embeddings / norm
        validembeddings = tf.nn.embedding_lookup(normalizedembeddings,
                                                 validdataset)
        similarity = tf.matmul(validembeddings,
                               normalizedembeddings,
                               transpose_b=True)

        # Add variable initializer.
        init = tf.global_variables_initializer()

    # Step 5: Begin training.
    numsteps = 100001
    # numsteps = 50001

    activepoll.statusis('Training on the data')
    with tf.Session(graph=graph) as tensorflowsession:
        # We must initialize all variables before we use them.
        init.run()
        thedata = trainingbatch['data']
        theindex = trainingbatch['dataindex']
        averageloss = 0
        for step in range(numsteps):
            newbatch = tfgeneratetrainingbatch(batchsize, numberofskips,
                                               skipwindow, thedata, theindex)
            thedata = newbatch['data']
            theindex = newbatch['dataindex']
            feeddict = {
                traininputs: newbatch['batch'],
                trainlabels: newbatch['labels']
            }

            # We perform one update step by evaluating the optimizer op (including it
            # in the list of returned values for session.run()
            _, lossval = tensorflowsession.run([optimizer, loss],
                                               feed_dict=feeddict)
            averageloss += lossval

            if step % 10000 == 0:
                activepoll.statusis('At step {s} of {n} training runs'.format(
                    s=step, n=numsteps))

            # if step % 2000 == 0:
            # 	if step > 0:
            # 		averageloss /= 2000
            # 	# The average loss is an estimate of the loss over the last 2000 batches.
            # 	print('Average loss at step ', step, ': ', averageloss)
            # 	averageloss = 0
            #
            # # print('codesmappedtowords', dataset['codesmappedtowords'])
            # # Note that this is expensive (~20% slowdown if computed every 500 steps)
            # if step % 10000 == 0:
            # 	sim = similarity.eval()
            # 	for i in range(max(validsize, len(dataset['codesmappedtowords'].keys()))):
            # 		try:
            # 			valid_word = dataset['codesmappedtowords'][validexamples[i]]
            # 		except IndexError as x:
            # 			print('ve', validexamples)
            # 			print(x)
            # 		top_k = 8  # number of nearest neighbors
            # 		nearest = (-sim[i, :]).argsort()[1:top_k + 1]
            # 		log_str = 'Nearest to %s:' % valid_word
            # 		for k in range(top_k):
            # 			close_word = dataset['codesmappedtowords'][nearest[k]]
            # 			log_str = '%s %s,' % (log_str, close_word)
            # 		print(log_str)

        finalembeddings = normalizedembeddings.eval()

    activepoll.statusis('Graphing the data')
    tsne = TSNE(perplexity=30,
                n_components=2,
                init='pca',
                n_iter=5000,
                method='exact')
    plotonly = min(500, vocabularysize)
    lowdimembs = tsne.fit_transform(finalembeddings[:plotonly, :])
    labels = [dataset['codesmappedtowords'][i] for i in range(plotonly)]
    tfplotwithlabels(lowdimembs, labels, os.path.join('', 'testplot.png'))

    return
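
# tftrainondata() uses the TensorFlow 1.x graph-mode API (tf.placeholder, tf.Session); under
# TensorFlow 2.x those names live in tf.compat.v1 and eager execution must be disabled.
# Below is a compact, framework-free sketch of the skip-gram pairing that the (unshown)
# tfgeneratetrainingbatch() helper is responsible for: for each target word, emit
# (target, context) pairs drawn from a window of `skipwindow` words on either side.
# The helper's exact batch format in the project is an assumption here.
def skipgrampairs(codes: list, skipwindow: int = 1) -> list:
    pairs = list()
    for i, target in enumerate(codes):
        lo = max(0, i - skipwindow)
        hi = min(len(codes), i + skipwindow + 1)
        for j in range(lo, hi):
            if j != i:
                pairs.append((target, codes[j]))
    return pairs

print(skipgrampairs([0, 1, 2, 3], skipwindow=1))
# [(0, 1), (1, 0), (1, 2), (2, 1), (2, 3), (3, 2)]
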
Example #7
def acquireandbagthewords(so: SearchObject) -> List[List[str]]:
    """

    [3] acquire and bag the words
        [a] grab db lines that are relevant to the search
        [b] turn them into a unified text block
        [c] do some preliminary cleanups
        [d1] break the text into sentences (NB: these are "unlemmatized bags of words")
        [d2] [disabled option] and assemble sentences-with-locus
        [e] figure out all of the words used in the passage
        [f] find all of the parsing info relative to these words
        [g] figure out which headwords to associate with the collection of words
        [h] build the lemmatized bags of words ('unlemmatized' can skip [e] - [g]...)

    """

    # [a] grab db lines that are relevant to the search
    so.poll.statusis('Grabbing the required lines')
    linesweneed = basicprecomposedsqlsearcher(so)
    so.poll.allworkis(-1)  # this turns off the % completed notice in the JS
    so.poll.sethits(0)

    # return from an SPop will leave them out of order...
    # dbWorkLine has __eq__, __gt__, and __lt__
    so.poll.statusis('Sorting the lines')
    linesweneed = sorted(linesweneed)

    # kill off titles and salutations: dangerous if there is a body l1 value of 't' out there
    so.poll.statusis('Pruning the lines')
    linesweneed = [r for r in linesweneed if r.l1 not in ['t', 'sa']]

    # [b] turn them into a unified text block
    # note that we will shortly discard the getlineurl() info ...
    so.poll.statusis('Joining the lines')
    wholetext = ' '.join([
        '⊏{i}⊐{t}'.format(i=l.getlineurl(), t=l.markedup) for l in linesweneed
    ])

    # [c] do some preliminary cleanups
    so.poll.statusis('Cleaning the lines')
    wholetext = re.sub(r'-\s{1,2}', str(), wholetext)
    wholetext = cleanvectortext(
        wholetext)  # this contains a de-abbreviator, html stripper, etc.
    wholetext = wholetext.lower()

    # [d1] break the text into sentences
    so.poll.statusis('Finding the sentences')
    terminations = ['.', '?', '!', '·', ';']
    allsentences = recursivesplit([wholetext], terminations)

    # do a little bit of extra cleaning that we could not do before
    punct = re.compile('[{s}]'.format(s=re.escape(punctuation +
                                                  elidedextrapunct)))
    allsentences = [re.sub(punct, str(), s) for s in allsentences]

    if so.sentencebundlesize > 1:
        # https://stackoverflow.com/questions/44104729/grouping-every-three-items-together-in-list-python
        allsentences = [
            ' '.join(bundle)
            for bundle in zip(*[iter(allsentences)] * so.sentencebundlesize)
        ]

    # [d2] [disabled option] and assemble sentences-with-locus (NB: these are "unlemmatized bags of words")
    bll = False
    if bll:
        unusedlistofdicts = buildlineandlocus(linesweneed[0], allsentences)
        del unusedlistofdicts

    # we might be using a lot of memory...
    del linesweneed

    # clean out the location info
    allsentences = [re.sub(r'⊏.*?⊐', str(), s) for s in allsentences]
    # consolewarning('trimming sentences: remove next line of code later')
    # allsentences = allsentences[:20]

    # consolewarning('acquireandbagthewords(): {s} sentences found'.format(s=len(allsentences)), color='red')

    morphdict = dict()
    if so.session['baggingmethod'] != 'unlemmatized':
        so.poll.statusis('Determining the set of words')
        # [e] figure out all of the words used in the passage
        allwords = findsetofallwords(allsentences)

        # [f] find all of the parsing info relative to these words
        so.poll.statusis('Building the parsing table')
        mo = getrequiredmorphobjects(allwords, furtherdeabbreviate=True)

        # [g] figure out which headwords to associate with the collection of words
        # {'θεῶν': {'θεόϲ', 'θέα', 'θεάω', 'θεά'}, 'πώ': {'πω'}, 'πολλά': {'πολύϲ'}, 'πατήρ': {'πατήρ'}, ... }
        morphdict = convertmophdicttodict(mo)

    # [h] build the lemmatized bags of words ('unlemmatized' can skip [e]-[g]...)
    wordbags = pythonpipelinewordbagbuilder(so, morphdict, allsentences)

    return wordbags
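
# A standalone illustration (invented sentences) of the bundling idiom used after step [d1] in
# acquireandbagthewords(): zip(*[iter(lst)] * n) advances one shared iterator n items at a
# time, grouping the sentences into bundles of n and silently dropping any trailing sentences
# that do not fill a complete bundle.
demosentences = ['s1', 's2', 's3', 's4', 's5']
bundlesize = 2
bundled = [' '.join(bundle) for bundle in zip(*[iter(demosentences)] * bundlesize)]
print(bundled)
# ['s1 s2', 's3 s4']  ('s5' is dropped because it does not complete a bundle)
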
Example #8
def ldatopicmodeling(sentencetuples, searchobject):
    """

	see:
		http://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html#sphx-glr-auto-examples-applications-plot-topics-extraction-with-nmf-lda-py

	CountVectorizer:
	max_df : float in range [0.0, 1.0] or int, default=1.0
		When building the vocabulary ignore terms that have a document frequency strictly higher than the given threshold (corpus-specific stop words).

	min_df : float in range [0.0, 1.0] or int, default=1
		When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold. This value is also called cut-off in the literature.

	see sample results at end of file

	:param sentencetuples:
	:param searchobject:
	:return:
	"""

    maxfeatures = 2000
    components = 15
    topwords = 15

    maxfreq = .60
    minfreq = 5
    iterations = 12

    mustbelongerthan = 2

    sentencetuples = [
        s for s in sentencetuples
        if len(s[1].strip().split(' ')) > mustbelongerthan
    ]
    sentences = [s[1] for s in sentencetuples]

    sentences = [s.split(' ') for s in sentences]
    allwordsinorder = [
        item for sublist in sentences for item in sublist if item
    ]

    morphdict = getrequiredmorphobjects(set(allwordsinorder))
    morphdict = convertmophdicttodict(morphdict)

    bagsofwords = buildwordbags(searchobject, morphdict, sentences)

    bagsofsentences = [' '.join(b) for b in bagsofwords]

    # Use tf (raw term count) features for LDA.
    ldavectorizer = CountVectorizer(max_df=maxfreq,
                                    min_df=minfreq,
                                    max_features=maxfeatures)

    ldavectorized = ldavectorizer.fit_transform(bagsofsentences)

    lda = LatentDirichletAllocation(n_components=components,
                                    max_iter=iterations,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=0)

    lda.fit(ldavectorized)

    print("\nTopics in LDA model:")
    tf_feature_names = ldavectorizer.get_feature_names()
    print_top_words(lda, tf_feature_names, topwords)

    # Use tf-idf features for NMF.
    tfidfvectorizer = TfidfVectorizer(max_df=0.95,
                                      min_df=2,
                                      max_features=maxfeatures)

    tfidf = tfidfvectorizer.fit_transform(bagsofsentences)

    # Fit the NMF model
    nmf = NMF(n_components=components, random_state=1, alpha=.1,
              l1_ratio=.5).fit(tfidf)

    print("\nTopics in NMF model (Frobenius norm):")
    tfidffeaturenames = tfidfvectorizer.get_feature_names()
    print_top_words(nmf, tfidffeaturenames, topwords)

    # Fit the NMF model
    print(
        "Fitting the NMF model (generalized Kullback-Leibler divergence) with "
        "tf-idf features, n_samples=%d and n_features=%d..." %
        (len(sentences), maxfeatures))

    nmf = NMF(n_components=components,
              random_state=1,
              beta_loss='kullback-leibler',
              solver='mu',
              max_iter=1000,
              alpha=.1,
              l1_ratio=.5).fit(tfidf)

    print("\nTopics in NMF model (generalized Kullback-Leibler divergence):")
    tfidffeaturenames = tfidfvectorizer.get_feature_names()
    print_top_words(nmf, tfidffeaturenames, topwords)

    return
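
# print_top_words() is not defined in this example; the scikit-learn topic-extraction tutorial
# that the docstring links to uses a helper along these lines (a sketch of what is presumably
# meant, not necessarily the project's exact definition). Note that newer scikit-learn releases
# replace get_feature_names() with get_feature_names_out().
def print_top_words(model, feature_names, n_top_words):
    for topicnumber, topic in enumerate(model.components_):
        # indices of the n_top_words largest weights, largest first
        top = topic.argsort()[:-n_top_words - 1:-1]
        print('Topic #{t}: {w}'.format(t=topicnumber, w=' '.join(feature_names[i] for i in top)))
    print()
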
Example #9
def sklearntextfeatureextractionandevaluation(sentences, searchobject):
    """

	see http://scikit-learn.org/stable/auto_examples/model_selection/grid_search_text_feature_extraction.html#sphx-glr-auto-examples-model-selection-grid-search-text-feature-extraction-py

	and

	http://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction

	:return:
	"""

    sentencesaslists = [s.split(' ') for s in sentences]
    allwordsinorder = [
        item for sublist in sentencesaslists for item in sublist if item
    ]

    morphdict = getrequiredmorphobjects(set(allwordsinorder))
    morphdict = convertmophdicttodict(morphdict)

    headwordsinorder = list()
    for w in allwordsinorder:
        try:
            hwds = [item for item in morphdict[w]]
            headwordsinorder.append('·'.join(hwds))
        except TypeError:
            pass
        except KeyError:
            pass

    pipeline = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', SGDClassifier()),
    ])

    parameters = {
        'vect__max_df': (0.5, 0.75, 1.0),
        # 'vect__max_features': (None, 5000, 10000, 50000),
        'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
        # 'tfidf__use_idf': (True, False),
        # 'tfidf__norm': ('l1', 'l2'),
        'clf__alpha': (0.00001, 0.000001),
        'clf__penalty': ('l2', 'elasticnet'),
        # 'clf__n_iter': (10, 50, 80),
    }

    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

    # categories = [
    # 	'alt.atheism',
    # 	'talk.religion.misc',
    # ]
    #
    # dataset = fetch_20newsgroups(subset='train', categories=categories)
    # print(dataset)
    """
	done in 65.998s
	
	Best score: 0.935
	Best parameters set:
		clf__alpha: 1e-05
		clf__penalty: 'l2'
		vect__max_df: 0.75
		vect__ngram_range: (1, 2)
	"""

    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)
    t0 = time()
    # grid_search.fit(dataset.data, dataset.target)

    print(sentences)
    grid_search.fit(sentences)
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

    return
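
# As written, grid_search.fit(sentences) will not run: a GridSearchCV wrapped around a
# supervised SGDClassifier needs target labels alongside the documents (the commented-out
# fetch_20newsgroups block supplied both). A minimal supervised sketch with invented documents
# and labels, purely for illustration:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

demodocs = ['arma virumque cano', 'gallia est omnis divisa', 'arma antiqua manus',
            'gallia divisa in partes tres']
demolabels = [0, 1, 0, 1]

demopipeline = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', SGDClassifier())])
demoparameters = {'vect__ngram_range': ((1, 1), (1, 2)), 'clf__alpha': (1e-4, 1e-5)}

demosearch = GridSearchCV(demopipeline, demoparameters, cv=2, n_jobs=1)
demosearch.fit(demodocs, demolabels)
print(demosearch.best_params_)
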
Example #10
def generateabsolutevectorsoutput(listsofwords: list, workssearched: list,
                                  searchobject, vtype: str):
    """


	:return:
	"""
    so = searchobject
    vv = so.vectorvalues
    activepoll = so.poll

    # find all words in use
    allwords = findsetofallwords(listsofwords)
    # print('allwords', allwords)

    # find all possible forms of all the words we used
    # consider subtracting some set like: rarewordsthatpretendtobecommon = {}
    activepoll.statusis('Finding headwords')
    morphdict = getrequiredmorphobjects(allwords, furtherdeabbreviate=True)
    morphdict = convertmophdicttodict(morphdict)

    # find all possible headwords of all of the forms in use
    # note that we will not know what we did not know: count unparsed words too and deliver that as info at the end?
    allheadwords = dict()
    for m in morphdict.keys():
        for h in morphdict[m]:
            allheadwords[h] = m

    if so.lemma:
        # set to none for now
        subtractterm = None
    else:
        subtractterm = so.seeking

    activepoll.statusis('Building vectors')
    vectorspace = buildrudimentaryvectorspace(allheadwords,
                                              morphdict,
                                              listsofwords,
                                              subtractterm=subtractterm)

    # for k in vectorspace.keys():
    # 	print(k, vectorspace[k])

    if so.lemma:
        focus = so.lemma.dictionaryentry
    else:
        focus = so.seeking

    activepoll.statusis('Calculating cosine distances')
    cosinevalues = caclulatecosinevalues(focus, vectorspace,
                                         allheadwords.keys())
    # cosinevalues = vectorcosinedispatching(focus, vectorspace, allheadwords.keys())
    # print('generatevectoroutput cosinevalues', cosinevalues)

    # apply the threshold and drop the 'None' items
    threshold = 1.0 - vv.localcutoffdistance
    falseidentity = .02
    cosinevalues = {
        c: 1 - cosinevalues[c]
        for c in cosinevalues
        if cosinevalues[c] and falseidentity < cosinevalues[c] < threshold
    }
    mostsimilar = [(c, cosinevalues[c]) for c in cosinevalues]
    mostsimilar = sorted(mostsimilar, key=lambda t: t[1], reverse=True)

    findshtml = formatnnmatches(mostsimilar, vv)

    # next we look for the interrelationships of the words that are above the threshold
    activepoll.statusis('Calculating metacosine distances')
    imagename = graphbliteraldistancematches(focus, mostsimilar, so)

    findsjs = generatevectorjs()

    output = SearchOutputObject(so)

    output.title = 'Cosine distances to »{skg}«'.format(skg=focus)
    output.found = findshtml
    output.js = findsjs

    if not so.session['cosdistbylineorword']:
        space = 'related terms in {s} {t}'.format(s=len(listsofwords), t=vtype)
    else:
        dist = so.session['proximity']
        scale = {'words': 'word', 'lines': 'line'}
        if int(dist) > 1:
            plural = 's'
        else:
            plural = str()
        space = 'related terms within {a} {b}{s}'.format(
            a=dist, b=scale[so.session['searchscope']], s=plural)

    found = max(vv.neighborscap, len(cosinevalues))
    output.setresultcount(found, space)
    output.setscope(workssearched)

    if so.lemma:
        xtra = 'all forms of '
    else:
        xtra = str()

    output.thesearch = '{x}»{skg}«'.format(x=xtra, skg=focus)
    output.htmlsearch = '{x}<span class="sought">»{skg}«</span>'.format(
        x=xtra, skg=focus)

    output.sortby = 'distance with a cutoff of {c}'.format(
        c=vv.localcutoffdistance)
    output.image = imagename
    output.searchtime = so.getelapsedtime()

    activepoll.deactivate()

    jsonoutput = json.dumps(output.generateoutput())

    return jsonoutput
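
# A small worked example (invented values) of the filtering applied in
# generateabsolutevectorsoutput(): the stored values are cosine distances, 1 - distance turns
# them into similarities, and anything closer than `falseidentity` (near-duplicates of the
# focus word) or farther than `threshold` is discarded before sorting.
democosinedistances = {'verbum': 0.25, 'sermo': 0.5, 'idem': 0.01, 'procul': 0.75}
demolocalcutoffdistance = 0.33
demothreshold = 1.0 - demolocalcutoffdistance  # 0.67
demofalseidentity = .02

kept = {w: 1 - d for w, d in democosinedistances.items() if demofalseidentity < d < demothreshold}
mostsimilar = sorted(kept.items(), key=lambda t: t[1], reverse=True)
print(mostsimilar)
# [('verbum', 0.75), ('sermo', 0.5)]
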
Example #11
def lsifindmatches(sentencestuples, searchobject, lsispace):
	"""


	:return:
	"""

	so = searchobject
	vv = so.vectorvalues

	activepoll = so.poll

	makespace = lsibuildspace

	if not lsispace:
		# find all words in use
		listsofwords = [s[1] for s in sentencestuples]
		allwords = findsetofallwords(listsofwords)

		# find all possible forms of all the words we used
		# consider subtracting some set like: rarewordsthatpretendtobecommon = {}
		wl = '{:,}'.format(len(listsofwords))
		activepoll.statusis('Finding headwords for {n} sentences'.format(n=wl))

		morphdict = getrequiredmorphobjects(allwords, furtherdeabbreviate=True)
		morphdict = convertmophdicttodict(morphdict)

		# find all possible headwords of all of the forms in use
		# note that we will not know what we did not know: count unparsed words too and deliver that as info at the end?
		allheadwords = dict()
		for m in morphdict.keys():
			for h in morphdict[m]:
				allheadwords[h] = m

		hw = '{:,}'.format(len(allheadwords.keys()))
		activepoll.statusis('Building vectors for {h} headwords in {n} sentences'.format(h=hw, n=wl))

		lsispace = makespace(searchobject, morphdict, listsofwords)
		storevectorindatabase(so, 'lsi', lsispace)

	vectorquerylsi = lsispace.findquerylsi(so.tovectorize)

	vectorindex = MatrixSimilarity(lsispace.semantics)

	similis = vectorindex[vectorquerylsi]
	# print('similis', similis)

	threshold = vv.lemmapaircutoffdistance

	matches = list()
	sims = sorted(enumerate(similis), key=lambda item: -item[1])
	count = 0
	activepoll.statusis('Sifting results')

	if not sentencestuples:
		sentencestuples = vectorprepdispatcher(so)

	dbconnection = ConnectionObject('autocommit')
	cursor = dbconnection.cursor()
	for s in sims:
		if s[1] > threshold:
			thissentence = lsispace.sentences[s[0]]
			# this part is slow and needs MP refactoring?
			# dblines = finddblinefromsentence(thissentence, subsearchobject)
			dblines = finddblinesfromsentences(thissentence, sentencestuples, cursor)
			if dblines:
				if len(dblines) > 1:
					xtra = ' <span class="small">[1 of {n} occurrences]</span>'.format(n=len(dblines))
				else:
					xtra = ''
				dbline = dblines[0]
				count += 1
				thismatch = dict()
				thismatch['count'] = count
				thismatch['score'] = float(s[1])  # s[1] comes back as <class 'numpy.float32'>
				thismatch['line'] = dbline
				thismatch['sentence'] = '{s}{x}'.format(s=' '.join(thissentence), x=xtra)
				thismatch['words'] = lsispace.bagsofwords[s[0]]
				matches.append(thismatch)

	dbconnection.connectioncleanup()

	matches = [m for m in matches if len(m['sentence'].split(' ')) > 2]

	return matches
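
# A minimal, generic gensim sketch (unrelated to the project's lsibuildspace object; invented
# toy texts) of the LSI querying pattern used in lsifindmatches(): build an LSI space, wrap it
# in MatrixSimilarity, project a query into the space, and rank the stored sentences by their
# similarity score. Requires gensim.
from gensim import corpora, models
from gensim.similarities import MatrixSimilarity

demotexts = [['arma', 'virum', 'cano'],
             ['arma', 'antiqua', 'manus'],
             ['gallia', 'divisa', 'partes']]

demodictionary = corpora.Dictionary(demotexts)
democorpus = [demodictionary.doc2bow(t) for t in demotexts]
demolsi = models.LsiModel(democorpus, id2word=demodictionary, num_topics=2)

demoindex = MatrixSimilarity(demolsi[democorpus])
demoquery = demodictionary.doc2bow(['arma', 'cano'])
demosimilis = demoindex[demolsi[demoquery]]

for docid, score in sorted(enumerate(demosimilis), key=lambda item: -item[1]):
    print(docid, float(score))
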