# Example 1
 def construct_term_doc_matrix(self, pca=False):
     """
     Constructs a term-document matrix such that td_matrix[document][term]
     contains the weighting score for the term in the document.

     The corpus is built from every document's tokens, or from
     ``self._filter_terms()`` when ``self.filter_terms`` is set.  The result
     is stored on the instance: ``self.attributes`` receives the column
     labels and ``self.td_matrix`` the tf-idf matrix (one row per document).

     :param pca: when True, the matrix is passed through
         ``construct_orange_table`` / ``orange_pca`` and the column labels
         are replaced by plain column indices.
     """
     if not self.filter_terms:
         corpus = nltk.TextCollection(
             [document.tokens for document in self.document_dict.values()])
     else:
         corpus = nltk.TextCollection(self._filter_terms())

     terms = list(set(corpus))
     data_rows = numpy.zeros([len(self.document_dict), len(terms)])
     for i, document in enumerate(self.document_dict.values()):
         text = nltk.Text(document.tokens)
         for item in document.word_frequencies:
             data_rows[i][terms.index(item.word)] = corpus.tf_idf(item.word, text)

     self.attributes = terms
     self.td_matrix = data_rows

     # If PCA is True then we project our points on their principal
     # components for dimensionality reduction.
     if pca:
         table = construct_orange_table(self.attributes, self.td_matrix)
         self.td_matrix = orange_pca(table)
         # Attribute names have no meaning after dimensionality reduction.
         self.attributes = list(range(self.td_matrix.shape[1]))
def compute_tf_idf_document_matrix(articles_dict):
    """Build a nested dict of tf-idf scores keyed by review id, then by term.

    Every value of ``articles_dict`` must provide ``'content'`` and
    ``'review_id'``.  Its lower-cased, whitespace-split content is stored
    back on the value under ``'tokenized'``.  Each key is converted with
    ``int()`` and used as a position in the corpus list.

    Returns ``td_matrix`` where ``td_matrix[review_id][term]`` is the tf-idf
    score of ``term`` in that article.
    """
    # A real list (a Python 3 ``range`` object does not allow item assignment).
    # NOTE(review): assumes the keys convert to 0..len-1 -- confirm with caller.
    all_articles = [None] * len(articles_dict)
    for key, article in articles_dict.items():
        tokens = article['content'].lower().split()
        all_articles[int(key)] = tokens
        article['tokenized'] = tokens

    # A TextCollection over all articles provides the tf-idf computation.
    tc = nltk.TextCollection(all_articles)

    # Target: tf-idf value for every (document, term) pair.
    td_matrix = {}
    for article in articles_dict.values():
        post = article['tokenized']
        fdist = nltk.FreqDist(post)

        doc_review_id = article['review_id']
        td_matrix[doc_review_id] = {}

        # Iterating the FreqDist yields its distinct terms on Python 2 and 3.
        for term in fdist:
            td_matrix[doc_review_id][term] = tc.tf_idf(term, post)

    return td_matrix
def main():
    """Rank Google+ activities by the summed tf-idf of the query terms.

    Loads the JSON file at ``f_path``, tokenizes the ``object.content`` of
    every activity, scores each activity against ``QUERY_TERMS`` and prints
    title, url and score of the activities with a positive score.
    """
    f_path = '[change to your googleplus_posts.json location]'
    with open(f_path) as f:
        data = json.loads(f.read())

    QUERY_TERMS = ['mobile']  # You can change the search terms here

    # Keep each activity next to its tokens.  Filtering empty posts into a
    # separate list and then indexing ``data`` by position would pair scores
    # with the wrong title/url as soon as one post is skipped.
    tokenized = []
    for activity in data:
        tokens = activity['object']['content'].lower().split()
        # Skipping token-less posts also avoids a zero-length text in tf().
        if tokens:
            tokenized.append((activity, tokens))

    # nltk TextCollection has tf-idf itself
    tc = nltk.TextCollection([tokens for _, tokens in tokenized])

    relevant_activities = []
    for activity, tokens in tokenized:
        score = 0
        for term in QUERY_TERMS:
            score += tc.tf_idf(term.lower(), tokens)
        if score > 0:
            relevant_activities.append({
                'score': score,
                'title': activity['title'],
                'url': activity['url'],
            })

    # NOTE(review): the tail of this call was lost in the source; descending
    # order (most relevant first) is assumed -- confirm.
    relevant_activities = sorted(relevant_activities,
                                 key=lambda a: a['score'],
                                 reverse=True)
    for ra in relevant_activities:
        # %-formatting prints the same text on Python 2 and Python 3.
        print('title:  %s' % (ra['title'],))
        print('url:  %s' % (ra['url'],))
        print('score:  %s' % (ra['score'],))
# Example 4
    def construct_term_doc_matrix(self, index, document):
        """
        Overrides the parent method for constructing a td_matrix. The reason is
        because we want to construct the matrix based on a sliding window approach.

        While ``index`` is smaller than ``self.window`` the corpus is built
        from every stored document; afterwards only from the slice
        ``[index - self.window + 1 : index]`` of the stored documents.

        Stores the corpus terms in ``self.attributes`` and the tf-idf vector
        of ``document`` over those terms in ``self.td_matrix``.
        """
        # list() so the values can be sliced on Python 3 as well.
        all_documents = list(self.document_dict.values())
        if index < self.window:
            documents = all_documents
        else:
            # NOTE(review): the slice end is exclusive, so the entry at
            # position ``index`` itself is not part of the window -- confirm
            # this is intended.
            window = (index - self.window + 1, index)
            documents = all_documents[window[0]:window[1]]

        # Online clustering doesn't support term filtering yet
        corpus = nltk.TextCollection(
            [doc.tokens for doc in documents])

        terms = list(set(corpus))
        term_vector = numpy.zeros(len(terms))

        text = nltk.Text(document.tokens)
        for item in document.word_frequencies:
            # terms.index raises ValueError for a word absent from the corpus.
            term_vector[terms.index(item.word)] = corpus.tf_idf(
                item.word, text)

        self.attributes = terms
        self.td_matrix = term_vector
def cluster_texts(texts, clustersNumber, distance):
    """Cluster ``texts`` with a ``GAAClusterer`` over ``TF`` vectors.

    Each text is turned into a vector by ``TF(text, unique_terms, collection)``
    and the vectors are handed to ``GAAClusterer(clustersNumber)``.  The
    ``distance`` argument is accepted but not used by this implementation.

    Returns the value of ``GAAClusterer.cluster(vectors, True)``.
    """
    # Turn the list of texts into a TextCollection.
    corpus = nltk.TextCollection(texts)
    print("Created a collection of", len(corpus), "terms.")

    # The unique terms are the dimensions of the term vectors.
    vocabulary = list(set(corpus))
    print("Unique terms found: ", len(vocabulary))

    # TF looks at how often each unique term appears in the document, not in
    # the whole collection; measures such as TF-IDF also take the collection
    # into account.
    vectors = []
    for text in texts:
        vectors.append(numpy.array(TF(text, vocabulary, corpus)))
    print("Vectors created.")

    # Run the clusterer.
    gaac = GAAClusterer(clustersNumber)
    return gaac.cluster(vectors, True)
# Example 6
def render_wordcloud(form, **kwargs):
    # Runs a search with the form values, tokenizes the text of every result
    # and renders the word-cloud results template from the resulting corpus.
    # NOTE(review): this block is truncated in the source (see the notes
    # below); the missing lines must be restored before it can run.
    session = Session()
    results = search.search(session, **form.values())
    # Create the corpus from the results
    tknzr = TweetTokenizer()
    texts = []
    for r in results:
        tokens = []
        # Sentence-split the result text, tokenize each sentence and keep the
        # tokens whose lower-cased form is not in stopwords_en.
        for sent in sent_tokenize(r.text.strip()):
            tokens += [
                w for w in tknzr.tokenize(sent.strip())
                if w.lower() not in stopwords_en
    # NOTE(review): the closing "]" of the comprehension is missing, and no
    # visible line adds `tokens` to `texts` -- presumably lost in extraction.
    corpus = nltk.TextCollection(texts)
    # noinspection PyProtectedMember
    results = {
        # The 1000 most common vocabulary entries, each converted to a list.
        'vocabulary': [list(i) for i in corpus.vocab().most_common(1000)],
        'collocations': corpus._collocations,
    # NOTE(review): the closing "}" of the dict is missing, and the remaining
    # arguments of render_template are cut off; they cannot be recovered here.
    view = render_template('./templates/search/results_wordcloud.html',
    return view
 def calculate_results(self):
     """Store the first 100 ``tf_icf`` entries of every city.

     Also derives, from the vocabulary of ``self.articles``, each entry's
     count relative to the count of the first vocabulary entry; that list is
     not used further in the visible code.
     """
     collection = nltk.TextCollection(self.articles)
     vocab = collection.vocab().items()
     overall_freqdist = []
     for fd in vocab:
         relative = float(fd[1]) / float(vocab[0][1])
         overall_freqdist.append((fd[0], relative))
     for city in self.cities:
         top_terms = self.tf_icf(city)[:100]
         self.cities[city]["freqdist"] = top_terms
# Example 8
def TF_IDF2(documents, dictionary):
    """Return one tf-idf vector per document over the terms of ``dictionary``.

    ``documents`` is an iterable of token sequences.  Each document is first
    reduced to the tokens that occur in ``dictionary``; the reduced documents
    form the corpus used for the tf-idf computation.

    Returns a list with one vector per document; each vector holds the tf-idf
    weight of ``str(item)`` for every ``item`` of ``dictionary``, in
    iteration order.

    NOTE(review): the visible code never stored the rebuilt texts or the
    computed weights, so it always returned an empty list.  The collecting
    steps were reconstructed here, and the texts are kept as token lists
    (not space-joined strings) so that term counts work on whole tokens --
    confirm against the intended behaviour.
    """
    # Rebuild the text set, keeping only in-dictionary tokens.
    texts = [
        [token for token in document if token in dictionary]
        for document in documents
    ]

    # TextCollection provides the tf-idf computation.
    tc = nltk.TextCollection(texts)

    vectors = []
    for text in texts:
        if text:
            vector = [tc.tf_idf(str(item), text) for item in dictionary]
        else:
            # tf() divides by the text length; an empty text scores 0 for
            # every term instead of raising ZeroDivisionError.
            vector = [0.0 for _ in dictionary]
        vectors.append(vector)

    return vectors
# Example 9
 def __init__(self, token_list_list):
     """
     Wrap the given token lists in an nltk.TextCollection.

     Args:
         token_list_list:    The list of list of tokens.
     """
     self.__collection = nltk.TextCollection(token_list_list)
def cluster_texts(texts, clustersNumber, distanceFunction, clusterMode):
    """
    Function to cluster several texts. The following inputs must be
    provided:
        *) texts: collection of texts to cluster
        *) clustersNumber: number of clusters to be used
        *) distanceFunction: distance function to be used by the
           clustering algorithms
        *) clusterMode: cluster mode to be used:"AgglomerativeClustering",
           "KMeans" or "MiniBatchKMeans", all of them belonging to the
           scikit-learn library

    Returns the cluster assignment produced by the selected clusterer, or
    None (after printing a message) when ``clusterMode`` is not recognised.
    """
    collection = nltk.TextCollection(texts)

    # Get a list of unique terms
    unique_terms = list(set(collection))

    # TF measures how often each unique term appears in the document; it
    # does not look at the whole collection (TF-IDF would).
    vectors = [numpy.array(TF(f, unique_terms, collection)) for f in texts]

    if clusterMode == "AgglomerativeClustering":
        # NOTE(review): the continuation of this call was lost in the
        # source; linkage/affinity are taken from the equivalent
        # AgglomerativeClustering calls elsewhere in this file.  Recent
        # scikit-learn releases name the `affinity` argument `metric`.
        clusterer = AgglomerativeClustering(n_clusters=clustersNumber,
                                            linkage="average",
                                            affinity=distanceFunction)
        clusters = clusterer.fit_predict(vectors)

    elif clusterMode == "KMeans":
        clusterer = KMeans(n_clusters=clustersNumber, random_state=0)
        clusters = clusterer.fit(vectors).predict(vectors)

    elif clusterMode == "MiniBatchKMeans":
        clusterer = MiniBatchKMeans(n_clusters=clustersNumber, random_state=0)
        clusters = clusterer.fit(vectors).predict(vectors)

    else:
        print("Invalid cluster mode")
        return None

    return clusters
# Example 11
 def get_most_frequent_terms(self, N=5):
     """
     Returns the top N occuring terms in this cluster.

     When ``self.top_patterns`` is set it is returned unchanged.  Otherwise
     the result is a list of at most N ``(term, count)`` pairs taken from the
     tokens of every document in ``self.document_dict``, highest count first.
     """
     if self.top_patterns is not None:
         return self.top_patterns

     corpus = nltk.TextCollection(
         [document.tokens for document in self.document_dict.values()])
     frequencies = nltk.FreqDist(corpus)
     # Sort explicitly: items() is only frequency-ordered (and sliceable) on
     # old nltk/Python 2; this form gives the top N everywhere.
     ranked = sorted(frequencies.items(), key=lambda pair: pair[1],
                     reverse=True)
     return ranked[:N]
# Example 12
def tfidf(doc, docs):
    """Score every distinct token of ``docs`` against ``doc``.

    ``docs`` is an iterable of token sequences and serves as the corpus.
    Returns a list of ``{"word": token, "tfidf": score}`` dicts, one per
    distinct token found anywhere in ``docs``.
    """
    # Flatten all documents and reduce them to their distinct tokens.
    vocabulary = set(chain.from_iterable(docs))
    collection = nltk.TextCollection(docs)

    scored = []
    for term in vocabulary:
        scored.append({
            "word": term,
            "tfidf": collection.tf_idf(term, doc),
        })
    return scored
# Example 13
def tf_idf(docs):
    """Map every distinct token of ``docs`` to a tf-idf score.

    The term frequency is measured against the concatenation of all
    documents, the inverse document frequency against ``docs`` as corpus.
    Returns a dict ``{token: score}``.
    """
    # Concatenate every document into one flat token list.
    all_tokens = []
    for doc in docs:
        all_tokens.extend(doc)

    collection = nltk.TextCollection(docs)
    return {
        token_type: collection.tf_idf(token_type, all_tokens)
        for token_type in set(all_tokens)
    }
# Example 14
def get_tf(docid, term, index):
    """Return the term frequency of ``term`` in the document ``docid``.

    The document text is read from ``index._doc_contents[docid]`` and
    tokenized with ``nltk.word_tokenize``.

    Returns the string ``"Not valid term, can not be term"`` when
    ``is_phrase_term(term)`` is true and ``"Not Found"`` when ``docid`` is
    not in the index; otherwise the value of ``TextCollection.tf``.
    """
    if is_phrase_term(term):
        # if it's a phrase, return error
        return "Not valid term, can not be term"

    if docid not in index._doc_contents:
        return "Not Found"

    doc = nltk.Text(nltk.word_tokenize(index._doc_contents[docid]))
    if len(doc) == 0:
        # tf() divides by the document length; an empty document holds no
        # occurrence of any term.
        return 0.0
    col = nltk.TextCollection([doc])
    return col.tf(term, doc)
# Example 15
    def _calculate_centroid(self):
        """
        It calculates the centroid of this collection of documents.

        The ``fv`` vector of every document in ``self.documents`` is copied
        into one row of a matrix whose width is the number of distinct corpus
        terms; the column-wise mean is stored in ``self.centroid``.
        """
        corpus = nltk.TextCollection(
            [document.tokens for document in self.documents.values()])
        terms = set(corpus)

        centroid = numpy.zeros([len(self.documents), len(terms)])
        for i, document in enumerate(self.documents.values()):
            centroid[i] = document.fv

        self.centroid = numpy.mean(centroid, axis=0)
# Example 16
 def get_collocations(self, n=2, N=5):
     """
     Returns the top collocations of the cluster corpus
     based on Jaccard index. The collocations correspond
     to n-grams and more specifically we limited the options
     to bigrams (n=2) and trigrams (n=3) ( n defaults to 2 ).

     NOTE(review): only the bigram path is present in the visible code and
     ``n`` is not consulted; the trigram branch and the final ``return``
     appear to have been lost.  The computed collocations are returned here
     instead of being discarded.
     """
     corpus = nltk.TextCollection(
         [document.tokens for document in self.document_dict.values()])
     finder = nltk.BigramCollocationFinder.from_words(corpus)
     scorer = nltk.metrics.BigramAssocMeasures.jaccard
     # Load the stopword list once; the filter is invoked for every word and
     # previously re-read the list on each call.
     stopwords = set(nltk.corpus.stopwords.words('english'))
     finder.apply_word_filter(lambda w: w in stopwords)
     collocations = finder.nbest(scorer, N)
     return collocations
# Example 17
    def _attach_feature_vectors(self):
        """
        Iterates over the summarizer documents and calculates a tf-idf
        weighted feature vector for each document. The feature vectors is
        attached to the document.

        The vector has one slot per distinct corpus term and is stored on
        each document as ``fv``.
        """
        corpus = nltk.TextCollection(
            [document.tokens for document in self.documents.values()])
        terms = list(set(corpus))

        # values() works on Python 2 and 3 (iteritems() is Python 2 only) and
        # avoids shadowing the builtin ``id``.
        for document in self.documents.values():
            text = nltk.Text(document.tokens)
            fv = numpy.zeros([len(terms)])
            for item in document.word_frequencies:
                fv[terms.index(item.word)] = corpus.tf_idf(item.word, text)
            document.fv = fv
# Example 18
def convertToTexts():
    """Turn the ``c_`` files of the working directory into nltk texts.

    Every file in the current working directory whose name contains ``"c_"``
    is read as UTF-8, lower-cased, stripped of characters that are neither
    word characters nor whitespace, tokenized and passed through
    ``remove_stopwords``.

    Returns ``[TextCollection, textList]`` where ``textList`` holds one
    ``nltk.Text`` per file.
    """
    print("Converting clean files to text collection...")
    textList = []
    for filename in os.listdir(os.getcwd()):
        if "c_" not in filename:
            continue
        # ``with`` closes the handle; it was previously left open.
        with open(filename, 'r', encoding='utf-8') as file:
            text = file.read().lower()
        text = re.sub(r'[^\w\s]', ' ', text)
        tokens = nltk.word_tokenize(text)
        tokens = remove_stopwords(tokens)
        # The visible code built the Text but never stored it, so the
        # returned collection was always empty.
        textList.append(nltk.Text(tokens))
    print("Finished converting clean files to Text collection")
    return [nltk.TextCollection(textList), textList]
# Example 19
def tfidf(word):
    """Return, per document, up to 20 terms selected by tf-idf score.

    ``word`` is an iterable of token sequences and doubles as the corpus.
    For every document the distinct terms with a positive tf-idf score are
    sorted by score and the first 20 terms are kept.

    Returns a list with one list of terms per document.

    NOTE(review): the visible code computed each term list but never added
    it to the result, so it always returned ``[]``; the append was restored.
    The sort is ascending, so the 20 *lowest*-scoring terms are kept --
    confirm whether descending order was intended.
    """
    collection = nltk.TextCollection(word)
    doc = []
    for do in word:
        wo = []
        for term in set(do):
            a = collection.tf_idf(term, do)
            if a > 0:
                wo.append([term, a])
        wo.sort(key=lambda x: x[1])
        doc.append([entry[0] for entry in wo[:20]])

    return doc
# Example 20
def tf_idf(sentence, resources):
    """Score the morphemes of ``sentence`` by tf-idf against a corpus file.

    ``resources["corpus"]`` names a text file with one space-separated
    document per line.  ``sentence`` is analysed with ``MeCab.Tagger``.

    Returns a list of ``(utf-8 encoded term, score)`` tuples sorted by
    descending score.

    NOTE(review): two parts of this block were lost in the source and are
    reconstructed: the loop body that collects the morphemes (the node
    surface is used; the original also split ``res.feature`` and may have
    filtered on it) and the tail of the ``result.append`` call (taken from
    the commented-out print that precedes it).  Confirm both.
    """
    result = []
    # ``with`` closes the corpus file; it was previously left open.
    with open(resources["corpus"]) as file:
        data = file.read()
    print("Finished reading file....")

    line = data.split("\n")

    # Morphological analysis of the given sentence
    mt = MeCab.Tagger(dic_path)
    res = mt.parseToNode(sentence)

    elements = []
    while res:
        elements.append(res.surface)
        res = res.next

    print("Finished morphological analysis....")

    # Drop the first and last node of the parse.
    elements = elements[1:-1]

    docs = [l.split(" ") for l in line]

    print("Finished spliting word....")

    collection = nltk.TextCollection(docs)

    for term in elements:
        result.append((term.encode("utf-8"),
                       collection.tf_idf(term, elements)))

    result = sorted(result, reverse=True, key=lambda x: float(x[1]))
    return result
# Example 21
def TFIDF(document):
    """Return tf-idf scores computed against ``document``.

    Every element of ``document`` is split on single spaces; the sorted
    union of the resulting words is wrapped in a ``TextCollection``.  One
    score is returned per item yielded by iterating that collection, each
    computed as ``collection.tf_idf(item, document)``.
    """
    vocabulary = set()
    for entry in document:
        # Merge the words of this entry into the running word set.
        vocabulary = vocabulary | set(entry.split(' '))

    # The word set is sorted alphabetically before building the collection.
    collection = nltk.TextCollection(sorted(vocabulary))

    return [
        collection.tf_idf(word, document)
        for word in list(collection)
    ]
# Example 22
def question_match_tf_idf(data_question1, data_question2):
    """Calculate the match rate between two questions based on TF_IDF

    The two arguments are parallel iterables of questions.  An IDF weight is
    computed for every token of the question corpus, then
    ``match_rate_tf_idf(question1, question2, weights)`` is evaluated for
    each pair.

    Returns the list of match rates, one per question pair.
    """
    # Materialise the inputs so they can be read twice.
    questions1 = list(data_question1)
    questions2 = list(data_question2)

    # Calculate IDF
    # NOTE(review): the lines that filled the corpus were lost in the source
    # (it was left empty, which yields no weights at all).  Using every
    # question of both inputs as one document each is an assumption --
    # confirm, including whether the questions are already tokenized.
    question_corpus = questions1 + questions2
    text_collection = nltk.TextCollection(question_corpus)
    weights = {
        word: text_collection.idf(word)
        for word in text_collection.tokens
    }

    # Calculate the match rate
    result = []
    for question1, question2 in zip(questions1, questions2):
        result.append(match_rate_tf_idf(question1, question2, weights))
    return result
# Example 23
    def create_index(self, documentos):
        # Walks over `documentos`, computes a tf-idf score for every cleaned
        # term of each document and looks the term up in self.contents.
        # NOTE(review): this block is truncated in the source in several
        # places (flagged below) and cannot run as shown.
        listaTextos = []
        # NOTE(review): the body of this first loop is missing; presumably it
        # filled listaTextos, which is used as the corpus below -- confirm.
        for d in documentos:

        for d in documentos:
            # Strip HTML, tokenize and lower-case the document text.
            tokens = nltk.wordpunct_tokenize(nltk.clean_html(d.texto))
            tokens = [token.lower() for token in tokens]
            frequencency = nltk.FreqDist(tokens)
            for i in frequencency.items():
                # i is a (token, count) pair; only the token is used here.
                termo = self.remove_punctuation(i[0])
                if len(termo) > 0:
                    # The collection is rebuilt from listaTextos for every term.
                    tc = nltk.TextCollection(listaTextos)
                    tf_idf = tc.tf_idf(termo, d.texto)
                    # Scan self.contents for an entry with the same term.
                    achou = False
                    index = 0
                    for c in self.contents:
                        index += 1
                        if c.termo == termo:
                            achou = True
                    content = Content()
                    content.termo = termo
                    # NOTE(review): the lines that would store `content` (and
                    # the branch guarding the message below) are missing.
                    if not achou:
                            print 'Nao foi possivel adicionar um termo'
                # NOTE(review): the triple-quoted string opened on the next
                # line is never closed in the visible source.
                '''chave = KeyValue(i[0],d.url,tf_idf)
				if self.hashTable.lookup(chave):
        return self.contents
# Example 24
    def load_possible_terms(self, np_text_list):
        """
        Retrieve possible words/terms from numpy list of text

        Each text is normalized with ``StringManipulator.normalize_text`` and
        its unique words are collected.  The words are merged into
        ``self.word_list`` (deduplicated with ``np.unique``) and
        ``self.text_collection`` is rebuilt from that list.

        Args:
            np_text_list(np(list(string))): Numpy list containing text which term to be extracted
        """
        temp_word_list = np.array([])

        for text in np_text_list:
            text = StringManipulator.normalize_text(text)
            temp_word_list = np.append(
                temp_word_list, StringManipulator.retrieve_unique_words(text))

        self.word_list = np.append(self.word_list, temp_word_list)
        self.word_list = np.unique(self.word_list)

        self.text_collection = nltk.TextCollection(self.word_list)
def cluster_texts(texts, clustersNumber, distance):
    """Cluster ``texts`` with average-linkage agglomerative clustering.

    Each text is turned into a vector by ``TF(text, unique_terms,
    collection)``; the vectors are clustered into ``clustersNumber`` groups
    using ``distance`` as the affinity.

    Returns the value of ``AgglomerativeClustering.fit_predict``.
    """
    # Load the list of texts into a TextCollection object.
    collection = nltk.TextCollection(texts)
    print("Created a collection of", len(collection), "terms.")

    # get a list of unique terms
    unique_terms = list(set(collection))
    print("Unique terms found: ", len(unique_terms))

    ### And here we actually call the function and create our array of vectors.
    vectors = [numpy.array(TF(f, unique_terms, collection)) for f in texts]
    print("Vectors created.")

    # initialize the clusterer
    # The affinity is the ``distance`` parameter: the previous code read a
    # name ``distanceFunction`` that this function never defines.
    # NOTE(review): recent scikit-learn releases name this argument `metric`.
    clusterer = AgglomerativeClustering(n_clusters=clustersNumber,
                                        linkage="average", affinity=distance)
    clusters = clusterer.fit_predict(vectors)

    return clusters
 def getRelevantNews(self):
     """Return the news items that match the query terms, best match first.

     ``self.news`` is refreshed from ``self.getNews()``.  Every item is
     scored with the summed tf-idf of the lower-cased ``QUERY_TERMS``; items
     with a positive score are kept, printed and returned as a list of
     ``{'score': ..., 'title': ...}`` dicts sorted by descending score.

     NOTE(review): the score previously started at 1, which made the
     ``score > 0`` filter always true and returned every item; it now starts
     at 0.  The body of the final output loop was lost in the source and is
     reconstructed as a simple print -- confirm both.
     """
     # Define your query here
     QUERY_TERMS = ['стол', 'кубка', 'регион']
     # fetch the list of news items
     self.news = self.getNews()
     # TextCollection provides the tf, idf and tf_idf abstractions, so there
     # is no need to define our own versions
     tc = nltk.TextCollection(self.news)
     # Lower-case the query once instead of once per news item.
     query = [t.lower() for t in QUERY_TERMS]
     relevant = []
     for item in self.news:
         score = 0
         for term in query:
             score += tc.tf_idf(term, item)
         if score > 0:
             relevant.append({'score': score, 'title': item})
     # Sort the results by relevance and output them
     relevants = sorted(relevant, key=lambda p: p['score'], reverse=True)
     for post in relevants:
         print('%s  %s' % (post['score'], post['title']))
     return relevants
def cluster_texts(texts, cluster_number, distance, verbose=True, measure=TF):
    """Cluster ``texts`` with average-linkage, cosine-affinity clustering.

    Each text is turned into a vector by ``measure(text, unique_terms,
    collection)``.  ``distance`` is accepted but not used: the affinity is
    fixed to ``'cosine'``.  With ``verbose`` set, the collection size and
    the number of unique terms are printed.

    Returns the value of ``AgglomerativeClustering.fit_predict``.
    """
    # Wrap the texts in a TextCollection and derive its distinct terms.
    corpus = nltk.TextCollection(texts)
    vocabulary = list(set(corpus))

    if verbose:
        print("Creando collecion de %d terminos" % len(corpus))
        print("Terminos unicos encontrados: ", len(vocabulary))

    # One vector per text, produced by the selected measure.
    vectors = []
    for text in texts:
        vectors.append(numpy.array(measure(text, vocabulary, corpus)))

    # Set up the clusterer and assign every vector to a cluster.
    model = AgglomerativeClustering(n_clusters=cluster_number,
                                    linkage="average", affinity='cosine')
    return model.fit_predict(vectors)
# Example 28
def getTDMatrix(textCorpus):
    """Compute a term-document matrix of tf-idf scores.

    ``textCorpus`` is a sequence of dicts providing ``'text'`` and
    ``'author'``.  The text is lower-cased and split on whitespace.

    Returns ``td_matrix`` such that ``td_matrix[author][term]`` is the
    tf-idf score of ``term`` in that article.  A running article counter is
    printed as progress output.

    NOTE(review): rows are keyed by ``'author'``, so a later article by the
    same author replaces the earlier one's row.
    """
    all_articles = [article['text'].lower().split() for article in textCorpus]

    tc = nltk.TextCollection(all_articles)

    td_matrix = {}
    for idx, article in enumerate(all_articles):
        # print() with a single argument behaves the same on Python 2 and 3.
        print(idx + 1)
        fdist = nltk.FreqDist(article)
        doc_title = textCorpus[idx]['author']
        td_matrix[doc_title] = {}
        # takes long..
        # Iterating the FreqDist yields its distinct terms (iterkeys() is
        # Python 2 only).
        for term in fdist:
            td_matrix[doc_title][term] = tc.tf_idf(term, article)
    return td_matrix
def cluster_texts(texts, clustersNumber, distance):
    """Cluster ``texts`` twice: on ``TF_IDF`` vectors and on ``IDF`` vectors.

    Returns the tuple ``(clusters_tfidf, clusters_idf)`` holding the
    ``fit_predict`` result for each vector set.
    """
    # Load the list of texts into a TextCollection object.
    collection = nltk.TextCollection(texts)
    print("Created a collection of {0}, terms.".format(len(collection)))
    # get a list of unique terms
    unique_terms = list(set(collection))
    print("Unique terms found: ", len(unique_terms))
    ### And here we actually call the function and create our array of vectors.
    vectors_tf_idf = [
        numpy.array(TF_IDF(f, unique_terms, collection)) for f in texts
    ]

    vectors_idf = [
        numpy.array(IDF(f, unique_terms, collection)) for f in texts
    ]
    print("Vectors created.")
    # initialize the clusterer
    # NOTE(review): the continuation of this call was lost in the source;
    # linkage/affinity follow the equivalent AgglomerativeClustering calls
    # elsewhere in this file, with ``distance`` as the affinity -- confirm.
    # Recent scikit-learn releases name the `affinity` argument `metric`.
    cluster = AgglomerativeClustering(n_clusters=clustersNumber,
                                      linkage="average", affinity=distance)
    clusters_tfidf = cluster.fit_predict(vectors_tf_idf)
    clusters_idf = cluster.fit_predict(vectors_idf)
    return (clusters_tfidf, clusters_idf)
# Example 30
def cluster_texts(texts, clustersNumber, distance):
    """Cluster ``texts`` with agglomerative clustering over ``TF_IDF`` vectors.

    Each text is turned into a vector by ``TF_IDF(text, unique_terms,
    collection)`` and the vectors are clustered into ``clustersNumber``
    groups.

    Returns the value of ``AgglomerativeClustering.fit_predict``.
    """
    # Load the list of texts into a TextCollection object.
    collection = nltk.TextCollection(texts)
    print("Created a collection of", len(collection), "terms.")

    # Get a list of unique terms
    unique_terms = list(set(collection))

    print("Unique terms found: ", len(unique_terms))

    # And here we actually call the function and create our array of vectors.
    vectors = [
        numpy.array(TF_IDF(f, unique_terms, collection)) for f in texts
    ]
    print("Vectors created.")

    # Initialize the clusterer -> classify the words into groups
    # NOTE(review): the continuation of this call was lost in the source;
    # linkage/affinity follow the equivalent AgglomerativeClustering calls
    # elsewhere in this file, with ``distance`` as the affinity -- confirm.
    # Recent scikit-learn releases name the `affinity` argument `metric`.
    clusterer = AgglomerativeClustering(n_clusters=clustersNumber,
                                        linkage="average", affinity=distance)
    clusters = clusterer.fit_predict(vectors)

    return clusters