Ejemplos de getCustomStopWords en Python

Lenguaje de programación: Python

Namespace/Package Name: topicmap

Método / Función: getCustomStopWords

Ejemplos en hotexamples.com: 5

Python getCustomStopWords - 5 ejemplos encontrados. Estos son los ejemplos en Python del mundo real mejor valorados de topicmap.getCustomStopWords extraídos de proyectos de código abierto. Puedes valorar ejemplos para ayudarnos a mejorar la calidad de los ejemplos.

Ejemplo n.º 1

Mostrar archivo

def testCase():
    """
    Run the whole topic-map pipeline once on a reduced set of stored feeds.

    Steps, in the order they are executed:

    1. Load all feed entries from file and reduce them with
       ``smallDict(..., 500)`` (presumably a cap of 500 entries -- confirm
       against ``smallDict``).
    2. Build the document list from the reduced entries, filtering with the
       custom stop-word list and without reloading the documents.
    3. Derive at most 20 topics from 3- and 4-grams.
    4. Update the reduced entries by the fuzzy relevance of those topics
       (``threshold=60``, ``remove=True``).
    5. Derive the soft-cosine similarity matrix of the reduced entries and
       persist it with ``saveDFPickle``.
    6. Render the 3D plot of the similarity matrix.

    Returns:
        None. The function is run for its side effects only.
    """
    entries = smallDict(loadAllFeedsFromFile(), 500)

    doc_list = getDocList(
        entries,
        limit=None,
        reloaddocs=False,
        stop_list=getCustomStopWords(),
    )
    topic_maps = deriveTopicMaps(doc_list, maxNum=20, ngram_range=(3, 4))

    # Annotates `entries` using the derived topics before the similarity
    # matrix is computed from the same entries.
    updateDictionaryByFuzzyRelevanceofTopics(
        topic_maps,
        entries,
        limit=None,
        threshold=60,
        remove=True,
    )

    similarity = deriveSoftCosineSimilarityMatrix(entries)
    saveDFPickle(similarity)
    do3DPlotOfCosineSimilarity(entries, None, similarity)

Ejemplo n.º 2

Mostrar archivo

def preparePyLDAvisData(allDict, limit=None, numTopics=30):
    """
    Train an LDA model on the documents of ``allDict`` and return the three
    objects the original comments label as pyLDAvis inputs.

    Args:
        allDict: feed entries, passed unchanged to ``getDocList``.
        limit: passed to ``getDocList`` as its second positional argument.
        numTopics: number of topics the LDA model is trained with.

    Returns:
        A tuple ``(ldamodel, sentences, dictionary)`` where ``dictionary`` is
        the gensim ``Dictionary`` of all tokens, ``sentences`` is the list of
        bag-of-words vectors (one per document) and ``ldamodel`` is the LDA
        model trained on them with 50 passes.
    """
    docs_zip = getDocList(allDict,
                          limit,
                          stop_list=getCustomStopWords(),
                          with_ids=True)

    # getDocList(with_ids=True) yields (id, text) pairs; only the text is
    # needed here. Each document is tokenised exactly once and the tokens are
    # reused for both the dictionary and the bag-of-words vectors.
    tokenized = [simple_preprocess(text) for _doc_id, text in docs_zip]

    # gensim Dictionary of unique IDs of all words in all documents
    # (pyLDAvis param "d")
    dictionary = corpora.Dictionary(tokenized)

    # Bag-of-words vector per document (pyLDAvis param "c")
    sentences = [dictionary.doc2bow(tokens) for tokens in tokenized]

    ldamodel = models.ldamodel.LdaModel(sentences,
                                        num_topics=numTopics,
                                        id2word=dictionary,
                                        passes=50)

    return (ldamodel, sentences, dictionary)

Ejemplo n.º 3

Mostrar archivo

Archivo: gensim_test.py Proyecto: jb-diplom/UvA-Papers

def preparePyLDAvisData(allDict, limit=None, numTopics=30):
    """
    Train an LDA model on the documents of ``allDict`` and return the three
    objects the original comments label as pyLDAvis inputs.

    Args:
        allDict: feed entries, passed unchanged to ``getDocList``.
        limit: passed to ``getDocList`` as its second positional argument.
        numTopics: number of topics the LDA model is trained with.

    Returns:
        A tuple ``(ldamodel, sentences, dictionary)`` where ``dictionary`` is
        the gensim ``Dictionary`` of all tokens, ``sentences`` is the list of
        bag-of-words vectors (one per document) and ``ldamodel`` is the LDA
        model trained on them with 50 passes.
    """
    docs_zip = getDocList(allDict,
                          limit,
                          stop_list=getCustomStopWords(),
                          with_ids=True)

    # getDocList(with_ids=True) yields (id, text) pairs; only the text is
    # needed here. Each document is tokenised exactly once and the tokens are
    # reused for both the dictionary and the bag-of-words vectors.
    tokenized = [simple_preprocess(text) for _doc_id, text in docs_zip]

    # gensim Dictionary of unique IDs of all words in all documents
    # (pyLDAvis param "d")
    dictionary = corpora.Dictionary(tokenized)

    # NOTE(review): a word-embedding term-similarity matrix used to be built
    # at this point, but it was neither fed to the LDA model nor returned, so
    # the model load and matrix computation were pure overhead.
    # TODO: if term-similarity filtering is wanted, wire it into the result
    # instead of computing and discarding it.

    # Bag-of-words vector per document (pyLDAvis param "c")
    sentences = [dictionary.doc2bow(tokens) for tokens in tokenized]

    ldamodel = models.ldamodel.LdaModel(sentences,
                                        num_topics=numTopics,
                                        id2word=dictionary,
                                        passes=50)

    return (ldamodel, sentences, dictionary)

Ejemplo n.º 4

Mostrar archivo

def deriveSoftCosineSimilarityMatrix(allDict,
                                     limit=None,
                                     weName="glove-wiki-gigaword-50",
                                     simThreshold=0.3):
    """
    Compute the pairwise soft-cosine similarity of all documents in
    ``allDict``.

    Args:
        allDict: feed entries, passed unchanged to ``getDocList``.
        limit: passed to ``getDocList`` as its second positional argument.
        weName: name of the word-embedding model requested from
            ``getWordEmbeddingModel``.
        simThreshold: ``threshold`` argument used when building the term
            similarity matrix from the embedding model.

    Returns:
        A square ``pandas.DataFrame`` whose index and columns are both the
        document ids, and whose cell ``[a, b]`` is the soft-cosine similarity
        of documents ``a`` and ``b`` rounded to two decimals.
    """
    docs_zip = getDocList(allDict,
                          limit,
                          stop_list=getCustomStopWords(),
                          with_ids=True)

    # Split the (id, text) pairs; each text is tokenised exactly once and the
    # tokens are reused for the dictionary and the bag-of-words vectors.
    ids = []
    tokenized = []
    for doc_id, text in docs_zip:
        ids.append(doc_id)
        tokenized.append(simple_preprocess(text))

    model = getWordEmbeddingModel(weName=weName)

    # gensim Dictionary of unique IDs of all words in all documents
    dictionary = corpora.Dictionary(tokenized)

    # Bag-of-words vector per document
    sentences = [dictionary.doc2bow(tokens) for tokens in tokenized]

    # TF-IDF weights each word by its relative importance to a document
    # within the whole collection; used to weight the term similarities.
    tf_idf = models.TfidfModel(sentences)

    # Term-to-term similarity matrix derived from the word embeddings
    similarity_matrix = model.similarity_matrix(dictionary,
                                                tfidf=tf_idf,
                                                threshold=simThreshold,
                                                exponent=2.0,
                                                nonzero_limit=100)

    # Cell [row][col] compares document `row` with document `col`. Every cell
    # is evaluated, including the diagonal and both triangles.
    theMatrix = [[
        round(softcossim(row_doc, col_doc, similarity_matrix), 2)
        for col_doc in sentences
    ] for row_doc in sentences]

    cossim_mat = pd.DataFrame(theMatrix, index=ids, columns=ids)

    return cossim_mat

Ejemplo n.º 5

Mostrar archivo

Archivo: scatterplots.py Proyecto: jb-diplom/Big-Data

def testDisplayTopics(numArticles=None, numTopics=30, dict=None):
    """
    Derive topics from a set of feed entries and display them.

    Args:
        numArticles: if truthy, the entries are reduced with
            ``smallDict(entries, numArticles)`` first; otherwise all entries
            are used.
        numTopics: passed as ``maxNum`` to ``deriveTopicMaps``.
        dict: feed entries to work on. When falsy (``None`` or empty) the
            entries are loaded with ``loadAllFeedsFromFile()`` instead.

    Returns:
        None. The function is run for its side effects only.
    """
    feeds = dict if dict else loadAllFeedsFromFile()
    selection = smallDict(feeds, numArticles) if numArticles else feeds

    doc_list = getDocList(
        selection,
        reloaddocs=False,
        stop_list=getCustomStopWords(),
    )
    # Topics are built from trigrams only.
    topic_maps = deriveTopicMaps(doc_list,
                                 maxNum=numTopics,
                                 ngram_range=(3, 3))
    updateDictionaryByFuzzyRelevanceofTopics(topic_maps, selection, limit=30)
    displayTopics(topic_maps)