Beispiel #1
0
def summarize(cluster, index):
    """
    Build a Summary for a cluster of documents.

    Every sentence of every document in ``cluster`` is added to a
    ``ClusterMaker`` created on ``index``.  After the affinity iterations,
    the distinct ``exemplar`` values of those sentences become the
    summary's sentences.  The guid, title and publish date are copied from
    the first document of the cluster.

    cluster -- non-empty sequence of documents; ``cluster[0]`` is read, so
               an empty cluster raises IndexError
    index   -- handed to ``ClusterMaker`` unchanged

    Returns the populated ``Summary``.
    """
    summary = Summary()

    cluster_maker = ClusterMaker(index)
    sentences = []
    for doc in cluster:
        for sentence in doc.sentences():
            cluster_maker.add(sentence)
            sentences.append(sentence)

    cluster_maker.process_add_list()
    # Fixed number of affinity rounds; the loop counter itself is unused.
    for _ in range(5):
        cluster_maker.iterate_affinity()

    # De-duplicate the exemplars through dict keys (same hashing as before)
    # and store a real list: on Python 3 ``dict.keys()`` is only a view, which
    # cannot be indexed and stays bound to the temporary dict.
    summary.sentences = list(
        dict.fromkeys(sentence.exemplar for sentence in sentences)
    )
    summary.guid = cluster[0].guid
    summary.title = cluster[0].title
    summary.publish_date = cluster[0].publish_date

    return summary
Beispiel #2
0
 def __init__(self):
     """
     Start with no documents, a fresh index, a clusterer built on that
     index, and a lock.
     """
     self.documents = []
     inverted_index = InvertedIndex()
     self.index = inverted_index
     # The clusterer receives the very same index object stored above.
     self.clustering = ClusterMaker(inverted_index)
     self.lock = threading.Lock()
Beispiel #3
0
class DocumentRepository(object):
    """
    Store for news documents

    Documents are kept on a heap (``self.documents``), added to an
    ``InvertedIndex`` and handed to a ``ClusterMaker`` built on that index.
    """

    def __init__(self):
        self.documents = []
        self.index = InvertedIndex()
        self.clustering = ClusterMaker(self.index)
        # Held by add() while it mutates the heap, index and clustering.
        self.lock = threading.Lock()

    def add(self, document_list):
        """
        Add a list of documents to the repository

        Each document is pushed onto the heap, indexed and clustered while
        ``self.lock`` is held.
        """
        # ``with`` guarantees the lock is released even if one of the calls
        # below raises; a bare acquire()/release() pair would leave it held
        # and block every later add().
        with self.lock:
            for doc in document_list:
                heapq.heappush(self.documents, doc)
                self.index.add(doc)
                self.clustering.add(doc)

    def recent_documents(self, count=10):
        """
        Retrieve the count most recent documents
        """
        return heapq.nlargest(count, self.documents)

    def recent_clusters(self, count=30):
        """
        Retrieve summaries of the count largest clusters among recent documents

        The ``count * 10`` largest documents on the heap are grouped by
        exemplar; the ``count`` biggest groups are summarized with
        ``summarization.summarize``.
        """
        recent_docs = heapq.nlargest(count * 10, self.documents)
        clusters = self._group_by_exemplar(recent_docs)
        return [
            summarization.summarize(members, self.index)
            for members in self._largest_clusters(clusters, count)
        ]

    def search(self, query, count=10):
        """
        Retrieve count documents matching the query
        """
        return self.index.search(self._keywords(query), count)

    def search_clusters(self, query, count=10):
        """
        Retrieve count clusters matching the query

        The documents returned by the index search are grouped by exemplar;
        the result is a list of document lists, largest cluster first.
        """
        retrieved_docs = self.index.search(self._keywords(query), count)
        clusters = self._group_by_exemplar(retrieved_docs)
        return self._largest_clusters(clusters, count)

    def search_guid(self, guid):
        """
        Retrieve whatever the index stores under guid
        """
        return self.index.get(guid)

    def rebuild(self):
        """
        Clear the index and clustering, then re-add every stored document

        Each document's ``exemplar`` is reset to the document itself and its
        ``children`` to an empty list before it is re-added.
        """
        # NOTE(review): unlike add(), this does not take self.lock -- confirm
        # that callers never run rebuild() concurrently with add().
        logging.info("Rebuilding index")
        self.index.clear()
        self.clustering.clear()
        for doc in self.documents:
            doc.exemplar = doc
            doc.children = []
            self.index.add(doc)
            self.clustering.add(doc)

    def stats(self):
        """
        Return a dict of human-readable counters for the index and clustering
        """
        return {
            "Number of indexed documents": self.index.n_documents,
            "Number of indexed words": self.index.n_words,
            "Number of clustered documents": len(self.clustering.objects)
        }

    @staticmethod
    def _keywords(query):
        """Split a query string into word tokens (apostrophes kept)."""
        # re.findall already returns a list; no extra list() needed.
        return re.findall(r"[\w']+", query)

    @staticmethod
    def _group_by_exemplar(docs):
        """Map each representative to the list of docs that point at it."""
        clusters = {}
        for doc in docs:
            # A document with a falsy exemplar represents itself.
            representative = doc.exemplar or doc
            clusters.setdefault(representative, []).append(doc)
        return clusters

    @staticmethod
    def _largest_clusters(clusters, count):
        """Return the member lists of the count biggest clusters, biggest first."""
        # Sorting (size, members) tuples keeps the original tie-break: equal
        # sizes fall back to comparing the member lists themselves.
        ranked = [(len(members), members) for members in clusters.values()]
        ranked.sort(reverse=True)
        return [members for (_, members) in ranked[:count]]