Example 1
 def construct_term_doc_matrix(self, pca=False):
     '''
     Constructs a term-document matrix such that td_matrix[document][term] 
     contains the weighting score for the term in the document.
     '''
     if not self.filter_terms:    
         corpus = nltk.TextCollection([document.tokens for document in self.document_dict.values()])
     else:
         corpus = nltk.TextCollection(self._filter_terms())
         
     terms = list(set(corpus))
     data_rows = numpy.zeros([len(self.document_dict), len(set(corpus))])
     
     for i, document in enumerate(self.document_dict.values()):
         text = nltk.Text(document.tokens)
         for item in document.word_frequencies:
             data_rows[i][terms.index(item.word)] = corpus.tf_idf(item.word, text)
     
     
     self.attributes = terms
     self.td_matrix = data_rows
             
     #If PCA is True then we project our points on their principal components
     #for dimensionality reduction
     if pca:
         t = construct_orange_table(self.attributes, self.td_matrix)
         self.td_matrix = orange_pca(t)
         #Attributes names have no meaning after dimensionality reduction
         self.attributes = [i for i in range(self.td_matrix.shape[1])]
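
The construct_orange_table and orange_pca helpers used above are not part of this listing. Below is a minimal, hypothetical sketch of the same dimensionality-reduction step, using scikit-learn's PCA as a stand-in for the Orange-based helpers; it is an illustration, not the original implementation.

# Hypothetical stand-in for the Orange-based PCA step above; scikit-learn's PCA
# replaces the unshown construct_orange_table / orange_pca helpers.
import numpy
from sklearn.decomposition import PCA

def sklearn_pca(td_matrix, n_components=2):
    # Project the term-document matrix onto its first principal components.
    return PCA(n_components=n_components).fit_transform(td_matrix)

# Toy usage: reduce a 3-document, 5-term matrix to 2 dimensions.
reduced = sklearn_pca(numpy.random.rand(3, 5))
print(reduced.shape)  # (3, 2)
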
Example 2
def compute_tf_idf_document_matrix(articles_dict):

    all_articles = list(range(len(articles_dict)))
    for k, v in articles_dict.items():
        text = v['content'].lower().split()
        all_articles[int(k)] = text
        v['tokenized'] = text

    #create a TextCollection corpus from all articles
    #this allows us to perform tf-idf
    tc = nltk.TextCollection(all_articles)

    #this is our target - matrix of all tf-idf values for every word and document
    td_matrix = {}
    for k, v in articles_dict.items():
        post = v['tokenized']
        fdist = nltk.FreqDist(post)

        doc_review_id = v['review_id']
        td_matrix[doc_review_id] = {}

        for term in fdist.keys():
            td_matrix[doc_review_id][term] = tc.tf_idf(term, post)

    return td_matrix
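
The examples in this listing all lean on the tf, idf and tf_idf methods of nltk.TextCollection. Here is a minimal self-contained sketch of that API, using made-up token lists.

# Minimal sketch of the nltk.TextCollection API used throughout this listing
# (toy data; each document is a list of tokens).
import nltk

docs = [['the', 'cat', 'sat'],
        ['the', 'dog', 'barked'],
        ['the', 'cat', 'and', 'the', 'dog']]
tc = nltk.TextCollection(docs)
print(tc.tf('cat', docs[0]))      # term frequency within one document
print(tc.idf('cat'))              # inverse document frequency over the collection
print(tc.tf_idf('cat', docs[0]))  # product of the two
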
Example 3
def main():
    f_path = '[change to your googleplus_posts.json location]'
    data = json.loads(open(f_path).read())

    QUERY_TERMS = ['mobile']  # You can change the search terms here

    activities = [
        activity['object']['content'].lower().split() for activity in data
        if activity['object']['content'] != ''
    ]

    # nltk TextCollection has tf-idf itself
    tc = nltk.TextCollection(activities)

    relevant_activities = []

    for i in range(len(activities)):
        score = 0
        for term in QUERY_TERMS:
            score += tc.tf_idf(term.lower(), activities[i])
        if score > 0:
            relevant_activities.append({
                'score': score,
                'title': data[i]['title'],
                'url': data[i]['url']
            })

    relevant_activities = sorted(relevant_activities,
                                 key=lambda a: a['score'],
                                 reverse=True)
    for ra in relevant_activities:
        print('title: ', ra['title'])
        print('url: ', ra['url'])
        print('score: ', ra['score'])
Example 4
    def construct_term_doc_matrix(self, index, document):
        '''
        Overrides the parent method for constructing a td_matrix, because here
        the matrix is built using a sliding-window approach.
        '''
        if index < self.window:
            documents = self.document_dict.values()
        else:
            window = (index - self.window + 1, index)
            documents = list(self.document_dict.values())[window[0]:window[1]]

        #Online clustering doesn't support term filtering yet
        corpus = nltk.TextCollection(
            [document.tokens for document in documents])

        terms = list(set(corpus))
        term_vector = numpy.zeros(len(set(corpus)))

        text = nltk.Text(document.tokens)
        for item in document.word_frequencies:
            term_vector[terms.index(item.word)] = corpus.tf_idf(
                item.word, text)

        self.attributes = terms
        self.td_matrix = term_vector
Example 5
def cluster_texts(texts, clustersNumber, distance):
    # Load the list of texts into a TextCollection object.
    collection = nltk.TextCollection(texts)
    print("Created a collection of", len(collection), "terms.")

    # To represent the texts as vectors of representative terms, get the list of unique terms.
    unique_terms = list(set(collection))
    print("Unique terms found: ", len(unique_terms))

    ### And here we actually call the function and create our array of vectors.
    # TF measures term frequency within each text: for every unique term it counts
    # how many times it appears in that document, not in the whole collection.
    # Other measures, such as TF-IDF, are more precise because they also look at
    # how often the term appears across the collection.
    vectors = [numpy.array(TF(f, unique_terms, collection)) for f in texts]
    print("Vectors created.")
    print(vectors)

    # initialize the clusterer
    clusterer = GAAClusterer(clustersNumber)
    clusters = clusterer.cluster(vectors, True)
    # The commented-out lines below do the same thing with another library, scikit-learn.
    #clusterer = AgglomerativeClustering(n_clusters=clustersNumber,
    #                                  linkage="average", affinity=distance)
    #clusters = clusterer.fit_predict(vectors)

    return clusters
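
The TF helper called when building the vectors above is not defined anywhere in this listing. A plausible minimal sketch, assuming it returns one weight per unique term using TextCollection.tf (relative term frequency):

# Hypothetical sketch of the undefined TF helper used above: one value per
# unique term, the relative frequency of that term in the given document.
def TF(document, unique_terms, collection):
    word_tf = []
    for word in unique_terms:
        word_tf.append(collection.tf(word, document))
    return word_tf
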
Example 6
def render_wordcloud(form, **kwargs):
    session = Session()
    results = search.search(session, **form.values())
    # Create the corpus from the results
    tknzr = TweetTokenizer()
    texts = []
    for r in results:
        tokens = []
        for sent in sent_tokenize(r.text.strip()):
            tokens += [
                w for w in tknzr.tokenize(sent.strip())
                if w.lower() not in stopwords_en
            ]
        texts.append(tokens)
    corpus = nltk.TextCollection(texts)
    corpus.collocations(100)
    # noinspection PyProtectedMember
    results = {
        'vocabulary': [list(i) for i in corpus.vocab().most_common(1000)],
        'collocations': corpus._collocations,
    }
    view = render_template('./templates/search/results_wordcloud.html',
                           form=form,
                           results=results,
                           **kwargs)
    session.close()
    return view
Example 7
 def calculate_results(self):
     vocab = nltk.TextCollection(self.articles).vocab().most_common()
     overall_freqdist = [(word, float(count) / float(vocab[0][1]))
                         for word, count in vocab]
     for city in self.cities:
         self.cities[city]["freqdist"] = self.tf_icf(city)[0:100]
         self.db.save(self.cities[city])
Example 8
def TF_IDF2(documents, dictionary):
    print('tf-idf')
    vectors = []
    i = 0
    # Rebuild the texts, keeping only the tokens present in the dictionary
    Texts = []
    for document in documents:
        Text = ''
        for token in document:
            if token in dictionary:
                Text += (' ' + token)
        Texts.append(Text)
        print(i)
        i += 1
    # Load the library used to compute tf-idf (nltk.TextCollection)
    tc = nltk.TextCollection(Texts)
    i = 0
    for document in Texts:
        vector = []
        for item in dictionary:
            # Compute tf-idf
            weight = tc.tf_idf(str(item), document)
            vector.append(weight)
        vectors.append(vector)
        print(i)
        i += 1
    # pd.DataFrame(vectors).to_csv(out, sep=",", header=None, index=None)

    return vectors
Example 9
 def __init__(self, token_list_list):
     '''
     Initialize.
     
     Args:
         token_list_list:    The list of list of tokens.
     '''
     self.__collection = nltk.TextCollection(token_list_list)
Example 10
def cluster_texts(texts, clustersNumber, distanceFunction, clusterMode):
    """
    Function to cluster several texts. The following inputs must be
    specified:
        *) texts: collection of texts to cluster
        *) clustersNumber: number of clusters to be used
        *) distanceFunction: distance function to be used by the
           clustering algorithms
        *) clusterMode: cluster mode to be used:"AgglomerativeClustering",
           "KMeans" or "MiniBatchKMeans", all of them belonging to the
           scikit-learn library

    """

    collection = nltk.TextCollection(texts)
    # print("Created a collection of", len(collection), "terms.")

    # Get a list of unique terms
    unique_terms = list(set(collection))
    # print("Unique terms found: ", len(unique_terms))

    ### And here we actually call the function and create our array of vectors.
    # TF measures term frequency within each text: for every unique term it counts
    # how many times it appears in that document, not in the whole collection.
    # Other measures, such as TF-IDF, are more precise because they also look at
    # how often the term appears across the collection.
    vectors = [numpy.array(TF(f, unique_terms, collection)) for f in texts]
    # print("Vectors created.")
    # print(vectors)

    # for vector in vectors:
    # print("Vector ", len(vector))

    # initialize the clusterer
    # clusterer = GAAClusterer(clustersNumber)
    # clusters = clusterer.cluster(vectors, True)
    # The commented-out lines above do the same clustering with nltk's GAAClusterer; below, scikit-learn is used instead.

    if clusterMode == "AgglomerativeClustering":

        clusterer = AgglomerativeClustering(n_clusters=clustersNumber,
                                            linkage="average",
                                            affinity=distanceFunction)
        clusters = clusterer.fit_predict(vectors)

    elif clusterMode == "KMeans":

        clusterer = KMeans(n_clusters=clustersNumber, random_state=0)
        clusters = clusterer.fit(vectors).predict(vectors)

    elif clusterMode == "MiniBatchKMeans":

        clusterer = MiniBatchKMeans(n_clusters=clustersNumber, random_state=0)
        clusters = clusterer.fit(vectors).predict(vectors)
    else:
        print("Invalid cluster mode")
        return None

    return clusters
Example 11
 def get_most_frequent_terms(self, N=5):
     '''
     Returns the top N occurring terms in this cluster.
     '''
     if self.top_patterns is not None:
         return self.top_patterns
     else:
         corpus = nltk.TextCollection([document.tokens for document in self.document_dict.values()])
         return nltk.FreqDist(corpus).most_common(N)
Example 12
def tfidf(doc, docs):
    """対象の文書と全文の形態素解析した単語リストを指定すると対象の文書のTF-IDFを返す"""
    tokens = list(chain.from_iterable(docs))  #flatten
    A = nltk.TextCollection(docs)
    token_types = set(tokens)
    return [{
        "word": token_type,
        "tfidf": A.tf_idf(token_type, doc)
    } for token_type in token_types]
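
A hypothetical usage of the function above, assuming nltk and itertools.chain are imported and each document is already a token list:

# Toy usage of tfidf(doc, docs); terms absent from the target document simply
# score 0.0 because their term frequency there is zero.
docs = [['cats', 'purr'], ['dogs', 'bark'], ['cats', 'and', 'dogs']]
scores = tfidf(docs[0], docs)
print(scores)  # e.g. [{'word': 'cats', 'tfidf': ...}, {'word': 'bark', 'tfidf': 0.0}, ...]
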
Example 13
def tf_idf(docs):
    tokens = []
    for doc in docs:
        tokens += doc
    tf_idf = {}
    A = nltk.TextCollection(docs)
    token_types = set(tokens)
    for token_type in token_types:
        #print token_type,'=', A.tf_idf(token_type,tokens)
        tf_idf[token_type] = A.tf_idf(token_type, tokens)
    return tf_idf
Example 14
def get_tf(docid, term, index):
    if is_phrase_term(term):
        # if it's a phrase, return error
        return "Not valid term, can not be term"
    else:
        if docid in index._doc_contents:
            doc = nltk.Text(nltk.word_tokenize(index._doc_contents[docid]))
            col = nltk.TextCollection([doc])
            return col.tf(term, doc)
        else:
            return "Not Found"
Example 15
    def _calculate_centroid(self):
        '''
        It calculates the centroid of this collection of documents.
        '''
        corpus = nltk.TextCollection([document.tokens for document in self.documents.values()])
        terms = list(set(corpus))

        centroid = numpy.zeros([len(self.documents.items()), len(terms)])
        for i, document in enumerate(self.documents.values()):
            centroid[i] = document.fv

        self.centroid = numpy.mean(centroid, axis=0)
Example 16
 def get_collocations(self, n=2, N=5):
     '''
     Returns the top collocations of the cluster corpus 
     based on Jaccard index. The collocations correspond 
     to n-grams and more specifically we limited the options
     to bigrams (n=2) and trigrams (n=3) ( n defaults to 2 ). 
     '''
     corpus = nltk.TextCollection([document.tokens for document in self.document_dict.values()])
     finder = nltk.BigramCollocationFinder.from_words(corpus)
     scorer = nltk.metrics.BigramAssocMeasures.jaccard
     #finder.apply_freq_filter(3)
     finder.apply_word_filter(lambda w:w in nltk.corpus.stopwords.words('english'))
     collocations = finder.nbest(scorer, N)
     return collocations
Example 17
    def _attach_feature_vectors(self):
        '''
        Iterates over the summarizer documents and calculates a tf-idf
        weighted feature vector for each document. The feature vector is
        attached to the document.
        '''
        corpus = nltk.TextCollection([document.tokens for document in self.documents.values()])
        terms = list(set(corpus))

        for id, document in self.documents.items():
            text = nltk.Text(document.tokens)
            fv = numpy.zeros([len(set(corpus))])
            for item in document.word_frequencies:
                fv[terms.index(item.word)] = corpus.tf_idf(item.word, text)
            self.documents[id].fv = fv
Example 18
def convertToTexts():
    print("Converting clean files to text collection...")
    textList = []
    for filename in os.listdir(os.getcwd()):
        if "c_" in filename:
            file = open(filename, 'r', encoding='utf-8')
            text = file.read().lower()
            text = re.sub(r'[^\w\s]', ' ', text)
            tokens = nltk.word_tokenize(text)
            tokens = remove_stopwords(tokens)
            text = nltk.Text(tokens)
            textList.append(text)
            file.close()
    print("Finished converting clean files to Text collection")
    return [nltk.TextCollection(textList), textList]
Example 19
def tfidf(word):
    collection = nltk.TextCollection(word)
    doc = []
    for do in word:
        wo = []
        for term in set(do):
            a = collection.tf_idf(term, do)
            if a > 0:
                wo.append([term, a])
        wo.sort(key=lambda x: x[1])
        wo.reverse()
        slice1 = [i[0] for i in wo]
        lists = slice1[:20]
        doc.append(list(lists))

    return doc
Example 20
def tf_idf(sentence, resources):
    result = []
    filename = resources["corpus"]
    file = open(filename)
    data = file.read()
    file.close()
    print("Finished reading file....")

    #data = data.decode("utf-8")
    line = data.split("\n")

    # Morphologically analyze the given sentence with MeCab
    mt = MeCab.Tagger(dic_path)
    mt.parse('')
    res = mt.parseToNode(sentence)

    elements = []
    while res:
        ft = res.feature.split(",")
        #elements.append(res.surface.decode("utf-8"))
        elements.append(res.surface)
        #print res.surface, res.feature
        res = res.next

    print("Finished morphological analysis....")

    elements = elements[1:-1]

    docs = []
    docs.append(elements)

    for l in line:
        docs.append(l.split(" "))

    print("Finished spliting word....")

    collection = nltk.TextCollection(docs)
    uniqTerms = list(set(collection))

    for term in elements:
        #print("%s : %f" % (term, collection.tf_idf(term, elements)))
        result.append((term.encode("utf-8"), collection.tf_idf(term,
                                                               elements)))

    result = sorted(result, reverse=True, key=lambda x: float(x[1]))
    return result
Example 21
def TFIDF(document):
    dokumen = ''
    kum_kata = set()
    for dokumen in document:
        kum_kata = kum_kata.union(set(dokumen.split(' ')))  # merge the word sets
    kum_kata = sorted(kum_kata)  # sort the collected words alphabetically
    collection = nltk.TextCollection(kum_kata)
    unique_terms = list(collection)
    word_tfidf = []
    for word in unique_terms:
        word_tfidf.append(collection.tf_idf(word, document))
    # file = open("TF_IDF.txt", "wb")
    # file.write("%s " %kum_kata + "%s\n" %word_tfidf)
    # file.close()
    return word_tfidf
Example 22
def question_match_tf_idf(data_question1, data_question2):
    """Calculate the match rate between two questions based on TF_IDF"""
    # Calculate IDF
    question_corpus = []
    question_corpus.extend(data_question1.tolist())
    question_corpus.extend(data_question2.tolist())
    text_collection = nltk.TextCollection(question_corpus)
    weights = {
        word: text_collection.idf(word)
        for word in text_collection.tokens
    }

    # Calculate the match rate
    result = []
    for question1, question2 in zip(data_question1, data_question2):
        result.append(match_rate_tf_idf(question1, question2, weights))
    return result
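
The match_rate_tf_idf helper used above is not shown in this listing. One plausible reading, sketched here under the assumption that each question is already a list of tokens, is an IDF-weighted word-overlap score; this is an illustration, not the original implementation.

# Hypothetical sketch of the undefined match_rate_tf_idf helper: share of the
# IDF weight that falls on words common to both questions.
def match_rate_tf_idf(question1, question2, weights):
    words1, words2 = set(question1), set(question2)
    total = sum(weights.get(w, 0.0) for w in words1 | words2)
    if total == 0.0:
        return 0.0
    shared = sum(weights.get(w, 0.0) for w in words1 & words2)
    return shared / total
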
Example 23
    def create_index(self, documentos):
        listaTextos = []
        for d in documentos:
            listaTextos.append(
                nltk.wordpunct_tokenize(
                    nltk.clean_html(d.texto.encode('utf-8'))))

        for d in documentos:
            # Note: nltk.clean_html was removed in NLTK 3.x; on newer versions an
            # HTML stripper such as BeautifulSoup is needed instead.
            tokens = nltk.wordpunct_tokenize(nltk.clean_html(d.texto))
            tokens = [token.lower() for token in tokens]
            frequencency = nltk.FreqDist(tokens)
            for i in frequencency.items():
                termo = self.remove_punctuation(i[0])
                if len(termo) > 0:
                    tc = nltk.TextCollection(listaTextos)
                    tf_idf = tc.tf_idf(termo, d.texto)
                    achou = False
                    index = 0
                    for idx, c in enumerate(self.contents):
                        if c.termo == termo:
                            achou = True
                            index = idx
                            break
                    content = Content()
                    content.termo = termo
                    if not achou:
                        content.urls.append(url=d.url,
                                            tf_idf=tf_idf,
                                            frequencia=i[1])
                        self.contents.append(content)
                    else:
                        try:
                            self.contents[index].urls.append(url=d.url,
                                                             tf_idf=tf_idf,
                                                             frequencia=i[1])
                        except:
                            print('Could not add the term')
                '''chave = KeyValue(i[0],d.url,tf_idf)
				if self.hashTable.lookup(chave):
					self.hashTable.append(chave)
				else:
					self.hashTable.add(chave)'''
        return self.contents
Example 24
    def load_possible_terms(self, np_text_list):
        """
			Retrieve possible words/terms from numpy list of text

			Args:
				np_text_list(np(list(string))): Numpy list containing text which term to be extracted
		"""

        temp_word_list = np.array([])

        for text in np_text_list:
            text = StringManipulator.normalize_text(text)
            temp_word_list = np.append(
                temp_word_list, StringManipulator.retrieve_unique_words(text))

        self.word_list = np.append(self.word_list, temp_word_list)
        self.word_list = np.unique(self.word_list)

        self.text_collection = nltk.TextCollection(self.word_list)
Example 25
def cluster_texts(texts, clustersNumber, distance):
    #Load the list of texts into a TextCollection object.
    collection = nltk.TextCollection(texts)
    print("Created a collection of", len(collection), "terms.")

    #get a list of unique terms
    unique_terms = list(set(collection))
    print("Unique terms found: ", len(unique_terms))

    ### And here we actually call the function and create our array of vectors.
    vectors = [numpy.array(TF(f,unique_terms, collection)) for f in texts]
    print("Vectors created.")

    # initialize the clusterer
    clusterer = AgglomerativeClustering(n_clusters=clustersNumber,
                                      linkage="average", affinity=distance)  # left as is
    clusters = clusterer.fit_predict(vectors)  # this prediction should resemble the reference

    return clusters
Example 26
 def getRelevantNews(self):
     # Define your query terms here
     QUERY_TERMS = ['стол', 'кубка', 'регион']
     # fetch the list of news items
     self.news = self.getNews()
     # TextCollection provides the tf, idf and tf_idf abstractions,
     # so we do not need to define our own versions
     tc = nltk.TextCollection(self.news)
     relevant = []
     for idx in range(len(self.news)):
         score = 1
         for term in [t.lower() for t in QUERY_TERMS]:
             score += tc.tf_idf(term, self.news[idx])
         if score > 0:
             relevant.append({'score': score, 'title': self.news[idx]})
     # Sort the results by relevance and print them
     relevants = sorted(relevant, key=lambda p: p['score'], reverse=True)
     for post in relevants:
         print('{0}'.format(post['title']))
     return relevants
Example 27
def cluster_texts(texts, cluster_number, distance, verbose=True, measure=TF):
    #Load the list of texts into a TextCollection object.
    collection = nltk.TextCollection(texts)

    #get a list of unique terms
    unique_terms = list(set(collection))

    if verbose:
        print("Creando collecion de %d terminos" % len(collection))
        print("Terminos unicos encontrados: ", len(unique_terms))

    ### And here we actually call the function and create our array of vectors.
    vectors = [numpy.array(measure(f,unique_terms, collection)) for f in texts]

    # initialize the clusterer
    clusterer = AgglomerativeClustering(n_clusters=cluster_number,
                                      linkage="average", affinity='cosine')
    clusters = clusterer.fit_predict(vectors)

    return clusters
Example 28
def getTDMatrix(textCorpus):
    all_articles = [article['text'].lower().split() for article in textCorpus]

    tc = nltk.TextCollection(all_articles)

    # Compute a term-document matrix such that td_matrix[doc_title][term]
    # returns a tf-idf score for the term in the document
    td_matrix = {}
    i = 0
    for idx in range(len(all_articles)):
        i += 1
        print(i)
        article = all_articles[idx]
        fdist = nltk.FreqDist(article)
        doc_title = textCorpus[idx]['author']
        td_matrix[doc_title] = {}
        # takes long..
        for term in fdist.keys():
            td_matrix[doc_title][term] = tc.tf_idf(term, article)
    return td_matrix
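
A hypothetical usage of getTDMatrix, assuming textCorpus is a list of dicts with at least 'text' and 'author' keys (articles by the same author would overwrite each other in the result):

# Toy corpus; the returned td_matrix is keyed by author, then by term.
textCorpus = [
    {'author': 'alice', 'text': 'Cats purr when happy'},
    {'author': 'bob',   'text': 'Dogs bark at cats'},
]
td_matrix = getTDMatrix(textCorpus)
print(td_matrix['alice']['cats'])  # tf-idf of 'cats' in alice's article
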
Example 29
def cluster_texts(texts, clustersNumber, distance):
    #Load the list of texts into a TextCollection object.
    collection = nltk.TextCollection(texts)
    print("Created a collection of {0}, terms.".format(len(collection)))
    #get a list of unique terms
    unique_terms = list(set(collection))
    print("Unique terms found: ", len(unique_terms))
    ### And here we actually call the function and create our array of vectors.
    vectors_tf_idf = [
        numpy.array(TF_IDF(f, unique_terms, collection)) for f in texts
    ]

    vectors_idf = [
        numpy.array(IDF(f, unique_terms, collection)) for f in texts
    ]
    print("Vectors created.")
    # initialize the clusterer
    cluster = AgglomerativeClustering(n_clusters=clustersNumber,
                                      linkage="average",
                                      affinity=distance)
    clusters_tfidf = cluster.fit_predict(vectors_tf_idf)
    clusters_idf = cluster.fit_predict(vectors_idf)
    return (clusters_tfidf, clusters_idf)
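
The TF_IDF and IDF helpers called above (and in the next example) are not defined in this listing either. Plausible minimal sketches, mirroring the TF helper sketched earlier, with one weight per unique term for a given document:

# Hypothetical sketches of the undefined TF_IDF and IDF helpers used above.
def TF_IDF(document, unique_terms, collection):
    return [collection.tf_idf(word, document) for word in unique_terms]

def IDF(document, unique_terms, collection):
    # Note: IDF ignores the document itself, so every document gets the same vector.
    return [collection.idf(word) for word in unique_terms]
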
Example 30
def cluster_texts(texts, clustersNumber, distance):

    #Load the list of texts into a TextCollection object.
    collection = nltk.TextCollection(texts)
    print("Created a collection of", len(collection), "terms.")

    # Get a list of unique terms
    unique_terms = list(set(collection))

    print("Unique terms found: ", len(unique_terms))

    # And here we actually call the function and create our array of vectors.
    vectors = [
        numpy.array(TF_IDF(f, unique_terms, collection)) for f in texts
    ]  # NEW
    print("Vectors created.")

    # Initialize the clusterer -> group the texts into clusters
    clusterer = AgglomerativeClustering(n_clusters=clustersNumber,
                                        linkage="average",
                                        affinity=distance)
    clusters = clusterer.fit_predict(vectors)

    return clusters