Example #1
File: article.py Project: keho98/argos
    def vectorize(self):
        """
        Returns the vectors representing this article.

        Articles are represented by:
            (bag of words vector, entities vector)
        """
        if self.vectors is None:
            bow_vec = vectorize(self.text)
            ent_vec = vectorize(' '.join(entities(self.text)))
            self.vectors = [bow_vec, ent_vec]
        return self.vectors
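The module-level vectorize() helper used above is not shown on this page. A minimal sketch of what a bag-of-words vectorizer along these lines could look like, using scikit-learn's HashingVectorizer; the actual argos implementation may differ, and the names here are stand-ins:

from sklearn.feature_extraction.text import HashingVectorizer

# Hypothetical stand-in for the module-level vectorize() used above;
# the real argos implementation is not shown on this page.
_hasher = HashingVectorizer(stop_words='english', norm='l2')

def vectorize(text_or_texts):
    # Accept a single string or a list of documents and return a
    # sparse bag-of-words matrix. HashingVectorizer is stateless,
    # so no vocabulary has to be fit beforehand.
    docs = [text_or_texts] if isinstance(text_or_texts, str) else text_or_texts
    return _hasher.transform(docs)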
Example #2
File: classify.py Project: keho98/argos
    def classify(self, docs, num_topics=5):
        """
        Classify a list of documents.

        Args:
            | docs (list)       -- the documents to classify (a list of strings)
            | num_topics (int)  -- number of top predicted topics
                                   to return for each doc.

        Returns:
            | list -- the list of lists of document topics.
        """

        # Returns a 2d array, where each array is
        # a list of probabilities for labels.
        docs_ = vectorize(docs)
        probs = self.clf.predict_proba(docs_)

        # argsort returns the *indices* that would sort each inner array
        # from low to high; these indices correspond to labels.
        probs_sorted = probs.argsort()

        # Slice all the inner arrays to get `num_topics` top probabilities (their indices).
        probs_top = probs_sorted[:, -num_topics:]

        # Convert the indices to the actual labels, and return.
        return [self.clf.classes_[prob_indices] for prob_indices in probs_top]
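The argsort-and-slice step is the core of the top-k selection above. A standalone sketch with made-up numbers (not from the argos project) showing how the sliced indices map back to labels:

import numpy as np

# Fake probability matrix for 2 documents over 4 labels,
# standing in for clf.predict_proba(docs_).
probs = np.array([[0.1, 0.5, 0.3, 0.1],
                  [0.6, 0.1, 0.2, 0.1]])
classes = np.array(['politics', 'sports', 'tech', 'world'])

num_topics = 2
probs_top = probs.argsort()[:, -num_topics:]   # indices of the 2 largest probs, ascending
top_labels = [classes[idx] for idx in probs_top]
# -> [['tech', 'sports'], ['tech', 'politics']]
#    (ordered lowest-to-highest probability within each pair)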
Example #3
File: classify.py Project: keho98/argos
    def train(self, docs, labels):
        """
        Train the classifier with documents and labels.
        The training can be online. That is, an existing
        classifier can be updated with new training data.

        Args:
            | docs (list)       -- the documents to train on (a list of strings)
            | labels (list)     -- the labels to train on (a list of lists of strings)
        """
        docs_ = vectorize(docs)
        self.clf.partial_fit(docs_, labels)
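partial_fit is what makes the training online. A minimal, self-contained sketch of the same pattern with scikit-learn; the choice of SGDClassifier and HashingVectorizer is an assumption (the argos self.clf is not shown on this page), and the sketch simplifies to one label per document, whereas the argos docstring describes lists of labels:

import numpy as np
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

# Hypothetical stand-ins; the argos project wires these up elsewhere.
vec = HashingVectorizer(stop_words='english')
clf = SGDClassifier(loss='log_loss')  # probabilistic loss ('log' in older scikit-learn)

docs = ['the senate passed the bill', 'the team won the final']
labels = ['politics', 'sports']

# On the first partial_fit call the full label set must be supplied,
# since later batches may not contain every class.
clf.partial_fit(vec.transform(docs), labels,
                classes=np.array(['politics', 'sports', 'tech']))

# Subsequent calls update the same model incrementally.
clf.partial_fit(vec.transform(['a new phone was released']), ['tech'])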
Example #4
    def digest(self):
        """
        Will process this instance's dump.
        """

        # Check if the specified file exists.
        if not exists(self.file):
            logger.info('Specified file {0} not found, fetching...'.format(
                self.file))
            self.fetch_dump()

        logger.info('Beginning digestion of pages.')

        # Process pages and collect their text content ("docs").
        docs = [self._process_page(elem) for elem in self._iterate_pages()]

        logger.info('Vectorizing the page documents...')
        # Vectorize the docs.
        doc_vecs = brain.vectorize(docs)

        # Testing
        #outfile = open('/Users/ftseng/Desktop/test.pickle', 'wb')
        #import pickle
        #pickle.dump(doc_vecs, outfile)

        # Pickle the docs to save to Mongo.
        #_doc_vecs = self.db().pickle(doc_vecs)
        #processed_name = self.url if self.url else self.file
        #self.db().add({'dump': processed_name, 'docs': _doc_vecs})
        #self.db().close()

        # Generate TF-IDF representation
        # of all docs upon completion.
        #self._generate_tfidf(docs)

        logger.info('Digestion complete!')

        if not self.silent:
            processed_name = self.url if self.url else self.file
            notify(
                'TF-IDF calculations complete for {0}!'.format(processed_name))
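The page iteration and processing helpers are not shown on this page. A rough sketch of how a method like _iterate_pages might stream pages out of a MediaWiki XML dump without loading it all into memory; the element tag, namespace URI, and function shape are assumptions:

from lxml import etree

NS = '{http://www.mediawiki.org/xml/export-0.10/}'  # assumed MediaWiki export namespace

def _iterate_pages(dump_path):
    # Stream <page> elements one at a time and clear each element
    # after it is consumed, so the full dump never has to fit in memory.
    for _, elem in etree.iterparse(dump_path, tag=NS + 'page'):
        yield elem
        elem.clear()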
Example #5
File: summarize.py Project: keho98/argos
def multisummarize(docs, summary_length=5):
    """
    Summarize multi documents.

    Args:
        | docs (list)           -- list of documents (i.e. texts)
        | summary_length (int)  -- the preferred sentence length of the summary (default=5)

    .. note::
        The current implementation is super naive,
        thus the quality and coherence of its summaries is pretty damn terrible.
        But it's purpose for now is that there is *some* API for
        multidoc summarization.

    Returns:
        | summary (list)    -- list of sentences selected for the summary.

    .. note::
        BTW: this is super slow. takes well over a minute for 4 moderately-sized documents.
    """
    # Collect all sentences from the input documents.
    # Also collect position information about each sentence.
    sents = []
    for doc in docs:
        sents += [(sent, vectorize(sent), pos + 1)
                  for pos, sent in enumerate(sentences(doc))]
    clusters = []

    # Cluster the sentences.
    for sent in sents:
        # sent = (sent, vec, pos)

        # Keep track of the maximum scoring cluster
        # (above some minimum similarity)
        # and the avg sim score.
        min_sim = 0.01
        max_cluster = None, min_sim
        for cluster in clusters:
            avg_sim = 0
            for sent_c in cluster:
                avg_sim += (1 - cosine(sent[1], sent_c[1]))
            avg_sim = avg_sim / len(cluster)
            if avg_sim >= max_cluster[1]:
                max_cluster = cluster, avg_sim

        # If a cluster was found,
        # add the sentence to it
        if max_cluster[0]:
            max_cluster[0].append(sent)

        # Otherwise, create a new cluster.
        else:
            clusters.append([sent])

    # Rank the clusters.
    # Assuming that clusters with more sentences are more important,
    # take the top `summary_length`.
    ranked_clusters = sorted(clusters, key=lambda x: -len(x))[:summary_length]

    # For each sentence cluster, select the highest scoring sentence.
    # Again - very naive.
    ideal_length = 20
    summary_sentences = []
    for cluster in ranked_clusters:
        max_sent = None, 0
        for sent in cluster:
            avg_sim = 0
            for sent_c in cluster:
                avg_sim += 1 - cosine(sent[1], sent_c[1])
            avg_sim = avg_sim / len(cluster)
            pos = sent[2]
            length = fabs(ideal_length - len(tokenize(sent[0]))) / ideal_length

            # Score is the average similarity penalized by distance from ideal length,
            # weighted by the inverse of the position.
            score = (avg_sim - length / 2) / pos
            if score >= max_sent[1]:
                max_sent = sent[0], score
        summary_sentences.append(max_sent[0])

    return summary_sentences
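The per-sentence score above combines average similarity, a length penalty, and position. A small worked example with made-up numbers, just to make the formula concrete (the import of math.fabs is an assumption for self-containment):

from math import fabs

# Worked example of the sentence score used above: a sentence with
# avg_sim = 0.30, 10 tokens, appearing 2nd in its document.
ideal_length = 20
avg_sim = 0.30
num_tokens = 10
pos = 2

length = fabs(ideal_length - num_tokens) / ideal_length   # |20 - 10| / 20 = 0.5
score = (avg_sim - length / 2) / pos                       # (0.30 - 0.25) / 2 = 0.025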