def test_ingester(self):
        # Records two documents, uploads them, and checks the row counts;
        # then repeats with a fresh ingester/sequencer to confirm that new
        # data is appended to what is already stored.
        ingester = DocumentIngester(self.corpus)
        sequencer = PhraseSequencer(self.corpus)

        first_batch = (
            'This document has three sentences. One of which matches. Two of which do not.',
            'This document has only two sentences. One of which matches.',
        )
        for text in first_batch:
            ingester._record_document(text, sentence_parse(text, sequencer), {})

        # Phrases have to be uploaded before the documents that refer to them
        # (same order as DocumentIngester.ingest uses).
        sequencer.upload_new_phrases()
        ingester._upload_new_documents()

        cursor = connection.cursor()

        def fetch_scalar(query):
            # Run a single-value query and return that value.
            cursor.execute(query)
            return cursor.fetchone()[0]

        self.assertEqual(2, fetch_scalar("select count(*) from documents"))
        # three sentences in the first document plus two in the second
        self.assertEqual(5, fetch_scalar("select count(*) from phrase_occurrences"))

        # make sure we can add on to existing data
        ingester = DocumentIngester(self.corpus)
        sequencer = PhraseSequencer(self.corpus)

        third_text = 'This document has only two sentences. Only one of which is new.'
        third_phrases = sentence_parse(third_text, sequencer)

        # Ids continue from the two documents already stored (0 and 1).
        third_id = ingester._record_document(third_text, third_phrases, {})
        self.assertEqual(2, third_id)
        # Expected parse: phrase 3 spans the first sentence, phrase 4 the second.
        self.assertEqual([(3, [(0, 37)]), (4, [(38, 63)])], third_phrases)

        sequencer.upload_new_phrases()
        ingester._upload_new_documents()

        self.assertEqual(3, fetch_scalar("select count(*) from documents"))
        self.assertEqual(7, fetch_scalar("select count(*) from phrase_occurrences"))
    def test_persistence(self):
        # Checks which phrase ids a sequencer hands out before and after
        # another sequencer on the same corpus uploads its phrases.
        original = PhraseSequencer(self.corpus)
        for phrase in ('a', 'b', 'c'):
            original.sequence(phrase)

        # new sequencer shouldn't see updates that haven't been persisted
        # note: should never do this in practice--should only ever be one
        # active sequencer per corpus.
        unsynced = PhraseSequencer(self.corpus)
        self.assertEqual(0, unsynced.sequence('a'))

        original.upload_new_phrases()
        # existing phrases still present
        self.assertEqual(1, original.sequence('b'))
        # new phrases can still be added
        self.assertEqual(3, original.sequence('d'))

        reloaded = PhraseSequencer(self.corpus)
        # previously uploaded phrase appears
        self.assertEqual(2, reloaded.sequence('c'))
        # but not d=3, which wasn't uploaded
        self.assertEqual(3, reloaded.sequence('e'))

        # sequencer with different corpus doesn't show at all
        foreign = PhraseSequencer(Corpus())
        self.assertEqual(0, foreign.sequence('f'))
Ejemplo n.º 3
0
class DocumentIngester(object):
    """Buffer parsed documents in temporary files and bulk-upload them to a corpus.

    Document rows and phrase-occurrence rows are accumulated as CSV in two
    temporary files, pushed to the corpus with ``corpus.upload_csv``, and
    optionally followed by a pairwise similarity computation.
    """

    def __init__(self, corpus, parser=sentence_parse, compute_similarities=True):
        """Return a new ingester for the corpus.

        parser may be sentence_parse or ngram_parser(n); it is called as
        ``parser(text, sequencer)``.

        compute_similarities controls whether ingest() runs
        compute_similarities() on the newly added documents.

        Client must ensure that no other ingester is running
        concurrently on the same corpus.
        """

        self.corpus = corpus
        self.parser = parser
        self.should_compute_similarities = compute_similarities

        # Continue numbering after the highest stored id; start at 0 when
        # the corpus reports no documents.
        max_doc_id = corpus.max_doc_id()
        self.next_id = max_doc_id + 1 if max_doc_id is not None else 0

        self.document_file = tempfile.TemporaryFile()
        self.document_writer = UnicodeWriter(self.document_file)

        self.occurrence_file = tempfile.TemporaryFile()

        self.sequencer = PhraseSequencer(corpus)

    def _record_document(self, text, phrases, metadata):
        """Buffer one document and its phrase occurrences; return its new id.

        text is the document body, phrases is an iterable of
        ``(phrase_id, [(start, end), ...])`` pairs, and metadata is a dict
        whose values must support ``.replace`` (i.e. strings).
        Nothing is sent to the corpus until _upload_new_documents() runs.
        """
        doc_id = self.next_id
        self.next_id += 1

        # Serialized as "key"=>"value" pairs with backslashes and double
        # quotes escaped in the value (looks like PostgreSQL hstore text
        # format -- confirm against the documents table definition).
        # NOTE(review): keys are interpolated without escaping.
        formatted_metadata = ",".join(
            '"%s"=>"%s"' % (key, value.replace('\\', '\\\\').replace('"', '\\"'))
            for (key, value) in metadata.items()
        )
        self.document_writer.writerow(
            [str(self.corpus.id), str(doc_id), text, formatted_metadata])

        for (phrase_id, indexes) in phrases:
            # Hand-written CSV field: a quoted {...} list of quoted
            # (start, end) pairs, with inner quotes doubled for CSV.
            formatted_indexes = '"{%s}"' % ", ".join(
                '""(%s, %s)""' % (start, end) for (start, end) in indexes)
            self.occurrence_file.write(
                "%s,%s,%s,%s\n" % (self.corpus.id, doc_id, phrase_id, formatted_indexes))

        return doc_id

    def _upload_new_documents(self):
        """Upload buffered document rows and phrase occurrences.

        Both temporary files are uploaded via ``corpus.upload_csv``, closed,
        and replaced with fresh empty buffers. Returns nothing.
        """

        self.document_file.flush()
        self.document_file.seek(0)
        self.corpus.upload_csv(self.document_file, 'documents')
        self.document_file.close()
        self.document_file = tempfile.TemporaryFile()
        # Use the same writer type as __init__ so rows recorded after an
        # upload are written exactly like rows recorded before it.
        self.document_writer = UnicodeWriter(self.document_file)

        self.occurrence_file.flush()
        self.occurrence_file.seek(0)
        self.corpus.upload_csv(self.occurrence_file, 'phrase_occurrences')
        self.occurrence_file.close()
        self.occurrence_file = tempfile.TemporaryFile()

    def ingest(self, docs):
        """Ingest set of new documents.

        docs is a sized collection whose items are either plain strings or
        dicts with 'text' and 'metadata' keys. Each document is parsed and
        buffered, then phrases and documents are uploaded, and similarities
        are computed when enabled at construction time.

        Returns the list of ids assigned to the new documents.
        """

        new_doc_ids = list()

        print("parsing %s documents..." % len(docs))

        for doc in docs:
            if isinstance(doc, basestring):
                text, metadata = doc, {}
            else:
                text, metadata = doc['text'], doc['metadata']

            phrases = self.parser(text, self.sequencer)
            new_doc_ids.append(self._record_document(text, phrases, metadata))

        print("uploading documents...")

        # Phrases go first, then the documents and occurrences that
        # reference them.
        self.sequencer.upload_new_phrases()
        self._upload_new_documents()

        if self.should_compute_similarities:
            print("computing similarities...")
            self.compute_similarities(new_doc_ids)

        return new_doc_ids

    @staticmethod
    def _pairs_for_comparison(all_ids, new_ids):
        """Yield ``(x, y)`` id pairs to compare, with ``x < y``.

        x ranges over all_ids in ascending order; y ranges over new_ids in
        descending order and must also be present in all_ids. A pair in
        which the new id is the smaller one is never produced, so this
        relies on new ids being larger than existing ones (ingest assigns
        ids in increasing order).
        """
        allowed_ids = set(all_ids)
        all_ids = sorted(all_ids)
        new_ids = sorted(new_ids, reverse=True)

        for x in all_ids:
            for y in new_ids:
                # new_ids is descending: once y <= x, every later y is too.
                if x >= y:
                    break
                if y in allowed_ids:
                    yield (x, y)

    def compute_similarities(self, new_doc_ids=None, min_similarity=0.5):
        """Write jaccard similarities for pairs involving new_doc_ids.

        Only pairs whose similarity is at least min_similarity are written
        to the corpus's similarity writer.
        """
        docs = self.corpus.all_docs()

        # new_doc_ids is used to keep from recomputing already known similarities.
        # None is special signal to compute on all doc pairs.
        if new_doc_ids is None:
            new_doc_ids = docs.keys()

        with get_similarity_writer(self.corpus.id) as writer:
            i = 0
            for (x, y) in self._pairs_for_comparison(docs.keys(), new_doc_ids):
                similarity = jaccard(docs[x], docs[y])
                if similarity >= min_similarity:
                    writer.write(x, y, similarity)

                # Flush and print a progress dot every ten million pairs.
                i += 1
                if i % 10000000 == 0:
                    writer.flush()
                    sys.stdout.write('.')
                    sys.stdout.flush()
class DocumentIngester(object):
    """Collects parsed documents as CSV rows and loads them into a corpus.

    Rows for the 'documents' and 'phrase_occurrences' tables are staged in
    temporary files and sent with ``corpus.upload_csv``; similarities between
    documents can then be computed for the newly added ids.
    """

    def __init__(self,
                 corpus,
                 parser=sentence_parse,
                 compute_similarities=True):
        """Return a new ingester for the corpus.

        parser may be sentence_parse or ngram_parser(n); ingest() invokes it
        as ``parser(text, sequencer)``.

        compute_similarities: when true, ingest() finishes by calling
        compute_similarities() for the documents it just added.

        Client must ensure that no other ingester is running
        concurrently on the same corpus.
        """

        self.corpus = corpus
        self.parser = parser
        self.should_compute_similarities = compute_similarities

        # Next id follows the corpus's current maximum, or is 0 when the
        # corpus has no maximum to report.
        max_doc_id = corpus.max_doc_id()
        self.next_id = max_doc_id + 1 if max_doc_id is not None else 0

        self.document_file = tempfile.TemporaryFile()
        self.document_writer = UnicodeWriter(self.document_file)

        self.occurrence_file = tempfile.TemporaryFile()

        self.sequencer = PhraseSequencer(corpus)

    def _record_document(self, text, phrases, metadata):
        """Stage a document and its phrase occurrences; return the assigned id.

        phrases is an iterable of ``(phrase_id, [(start, end), ...])``.
        metadata is a dict; its values are escaped with ``.replace`` and so
        must be strings. Rows are only staged here, not uploaded.
        """
        doc_id = self.next_id
        self.next_id += 1

        # "key"=>"value" pairs; backslash and double quote are escaped in
        # values only (presumably PostgreSQL hstore syntax -- TODO confirm).
        # NOTE(review): keys are not escaped.
        formatted_metadata = ",".join([
            ('"%s"=>"%s"' %
             (key, value.replace('\\', '\\\\').replace('"', '\\"')))
            for (key, value) in metadata.items()
        ])
        self.document_writer.writerow(
            [str(self.corpus.id),
             str(doc_id), text, formatted_metadata])

        for (phrase_id, indexes) in phrases:
            # The index list is emitted as one pre-quoted CSV field, hence
            # the doubled inner quotes.
            formatted_indexes = '"{%s}"' % ", ".join(
                ['""(%s, %s)""' % (start, end) for (start, end) in indexes])
            self.occurrence_file.write(
                "%s,%s,%s,%s\n" %
                (self.corpus.id, doc_id, phrase_id, formatted_indexes))

        return doc_id

    def _upload_new_documents(self):
        """Upload document text and phrase occurrences.

        Each staging file is flushed, rewound, uploaded, closed and then
        replaced by a new empty temporary file. No value is returned.
        """

        self.document_file.flush()
        self.document_file.seek(0)
        self.corpus.upload_csv(self.document_file, 'documents')
        self.document_file.close()
        self.document_file = tempfile.TemporaryFile()
        # Keep the writer type identical to the one created in __init__;
        # switching to a different writer here would change how later rows
        # are written.
        self.document_writer = UnicodeWriter(self.document_file)

        self.occurrence_file.flush()
        self.occurrence_file.seek(0)
        self.corpus.upload_csv(self.occurrence_file, 'phrase_occurrences')
        self.occurrence_file.close()
        self.occurrence_file = tempfile.TemporaryFile()

    def ingest(self, docs):
        """Ingest set of new documents.

        Each item of docs is either a string (text with empty metadata) or
        a dict providing 'text' and 'metadata'. docs must support len().

        Returns the list of document ids assigned during this call.
        """

        new_doc_ids = list()

        print("parsing %s documents..." % len(docs))

        for doc in docs:
            if isinstance(doc, basestring):
                text = doc
                metadata = {}
            else:
                text = doc['text']
                metadata = doc['metadata']

            phrases = self.parser(text, self.sequencer)
            doc_id = self._record_document(text, phrases, metadata)
            new_doc_ids.append(doc_id)

        print("uploading documents...")

        # Upload phrases before the occurrences that point at them.
        self.sequencer.upload_new_phrases()
        self._upload_new_documents()

        if self.should_compute_similarities:
            print("computing similarities...")
            self.compute_similarities(new_doc_ids)

        return new_doc_ids

    @staticmethod
    def _pairs_for_comparison(all_ids, new_ids):
        """Generate ``(x, y)`` pairs with x from all_ids, y from new_ids, x < y.

        y is skipped unless it also appears in all_ids. Because only pairs
        with the new id on the larger side are generated, callers are
        expected to pass new ids that are greater than the existing ones.
        """
        allowed_ids = set(all_ids)
        all_ids = list(all_ids)
        all_ids.sort()

        new_ids = list(new_ids)
        new_ids.sort(reverse=True)

        for x in all_ids:
            for y in new_ids:
                # Descending order lets us stop at the first y not above x.
                if x >= y:
                    break
                if y in allowed_ids:
                    yield (x, y)

    def compute_similarities(self, new_doc_ids=None, min_similarity=0.5):
        """Compute and store jaccard similarity for pairs touching new_doc_ids.

        A pair is written only when its similarity reaches min_similarity.
        """
        docs = self.corpus.all_docs()

        # new_doc_ids is used to keep from recomputing already known similarities.
        # None is special signal to compute on all doc pairs.
        if new_doc_ids is None:
            new_doc_ids = docs.keys()

        with get_similarity_writer(self.corpus.id) as writer:
            i = 0
            for (x, y) in self._pairs_for_comparison(docs.keys(), new_doc_ids):
                similarity = jaccard(docs[x], docs[y])
                if similarity >= min_similarity:
                    writer.write(x, y, similarity)

                # Every 10,000,000 pairs: flush the writer and show progress.
                i += 1
                if i % 10000000 == 0:
                    writer.flush()
                    sys.stdout.write('.')
                    sys.stdout.flush()