def __init__(self, corpus, parser=sentence_parse, compute_similarities=True):
    """Return a new ingester for the corpus.

    parser may be sentence_parse or ngram_parser(n)

    Client must ensure that no other ingester is running
    concurrently on the same corpus.
    """
    self.corpus = corpus
    self.parser = parser
    self.should_compute_similarities = compute_similarities

    max_doc_id = corpus.max_doc_id()
    self.next_id = max_doc_id + 1 if max_doc_id is not None else 0

    self.document_file = tempfile.TemporaryFile()
    self.document_writer = UnicodeWriter(self.document_file)
    self.occurrence_file = tempfile.TemporaryFile()

    self.sequencer = PhraseSequencer(corpus)
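# ngram_parser is referenced in the docstring above but is not shown in this
# excerpt. Purely as an illustrative sketch (not the project's code): a
# word-n-gram parser would need to mirror sentence_parse's output shape, a
# sorted list of (phrase_id, [(start, end), ...]) tuples. The tests below
# show that sentence_parse normalizes case and trailing punctuation; this
# sketch only lowercases.
import re

def ngram_parser_sketch(n):
    def parse(text, sequencer):
        occurrences = {}
        # (start, end) offsets of each whitespace-delimited word
        words = [(m.start(), m.end()) for m in re.finditer(r'\S+', text)]
        for i in range(len(words) - n + 1):
            start, end = words[i][0], words[i + n - 1][1]
            phrase_id = sequencer.sequence(text[start:end].lower())
            occurrences.setdefault(phrase_id, []).append((start, end))
        return sorted(occurrences.items())
    return parse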
def test_empty(self):
    s = PhraseSequencer(self.corpus)
    c = connection.cursor()

    p = sentence_parse('', s)
    self.assertEqual([], p)

    c.execute('select count(*) from phrases')
    self.assertEqual(0, c.fetchone()[0])
def test_basic(self):
    t1 = "This is a basic text. Two sentences. Maybe three?"
    t2 = "Two sentences. Maybe...three? this is a basic text."

    s = PhraseSequencer(self.corpus)

    p1 = sentence_parse(t1, s)
    p2 = sentence_parse(t2, s)

    self.assertEqual([(0, [(0, 21)]), (1, [(22, 36)]), (2, [(37, 49)])], p1)
    self.assertEqual([(0, [(30, 51)]), (1, [(0, 14)]), (2, [(15, 29)])], p2)
def test_sentence_parse(self):
    s = PhraseSequencer(self.corpus)

    t = ''
    self.assertEqual([], sentence_parse(t, s))

    t = ' '
    self.assertEqual([], sentence_parse(t, s))

    t = 'A simple test case. Of two sentences.'
    self.assertEqual([(0, [(0, 19)]), (1, [(20, 37)])], sentence_parse(t, s))

    t = ' \n A simple test case. \t \t \n Of two sentences.\n'
    self.assertEqual([(0, [(3, 22)]), (1, [(29, 46)])], sentence_parse(t, s))

    t = 'of two sentences. of two sentences?'
    self.assertEqual([(1, [(0, 17), (18, 35)])], sentence_parse(t, s))
def test_basic(self):
    s = PhraseSequencer(self.corpus)

    a = s.sequence('a')
    b = s.sequence('b')
    c = s.sequence('c')

    self.assertEqual(0, a)
    self.assertEqual(1, b)
    self.assertEqual(2, c)

    self.assertEqual(a, s.sequence('a'))
    self.assertEqual(b, s.sequence('b'))
    self.assertEqual(c, s.sequence('c'))
def test_ingester(self):
    i = DocumentIngester(self.corpus)
    s = PhraseSequencer(self.corpus)

    t1 = 'This document has three sentences. One of which matches. Two of which do not.'
    t2 = 'This document has only two sentences. One of which matches.'

    i._record_document(t1, sentence_parse(t1, s), {})
    i._record_document(t2, sentence_parse(t2, s), {})

    s.upload_new_phrases()
    i._upload_new_documents()

    c = connection.cursor()
    c.execute("select count(*) from documents")
    self.assertEqual(2, c.fetchone()[0])
    c.execute("select count(*) from phrase_occurrences")
    self.assertEqual(5, c.fetchone()[0])

    # make sure we can add on to existing data
    i = DocumentIngester(self.corpus)
    s = PhraseSequencer(self.corpus)

    t3 = 'This document has only two sentences. Only one of which is new.'
    p3 = sentence_parse(t3, s)
    doc_id = i._record_document(t3, p3, {})

    self.assertEqual(2, doc_id)
    self.assertEqual([(3, [(0, 37)]), (4, [(38, 63)])], p3)

    s.upload_new_phrases()
    i._upload_new_documents()

    c.execute("select count(*) from documents")
    self.assertEqual(3, c.fetchone()[0])
    c.execute("select count(*) from phrase_occurrences")
    self.assertEqual(7, c.fetchone()[0])
def test_persistence(self):
    s1 = PhraseSequencer(self.corpus)

    a = s1.sequence('a')
    b = s1.sequence('b')
    c = s1.sequence('c')

    # A new sequencer shouldn't see updates that haven't been persisted.
    # Note: never do this in practice--there should only ever be one
    # active sequencer per corpus.
    s2 = PhraseSequencer(self.corpus)
    self.assertEqual(0, s2.sequence('a'))

    s1.upload_new_phrases()
    self.assertEqual(1, s1.sequence('b'))  # existing phrases still present
    self.assertEqual(3, s1.sequence('d'))  # new phrases can still be added

    s3 = PhraseSequencer(self.corpus)
    self.assertEqual(2, s3.sequence('c'))  # previously uploaded phrase appears
    self.assertEqual(3, s3.sequence('e'))  # but not 'd'=3, which wasn't uploaded

    s4 = PhraseSequencer(Corpus())
    self.assertEqual(0, s4.sequence('f'))  # a sequencer on a different corpus sees none of these phrases
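# The PhraseSequencer implementation itself is not shown in this excerpt.
# A minimal in-memory sketch that satisfies the behavior the tests above
# exercise, assuming hypothetical corpus.all_phrases() (returning a
# {text: phrase_id} dict of persisted phrases) and corpus.upload_phrases()
# methods:
class PhraseSequencerSketch(object):

    def __init__(self, corpus):
        self.corpus = corpus
        self.phrases = corpus.all_phrases()  # persisted phrases only
        self.next_id = max(self.phrases.values()) + 1 if self.phrases else 0
        self.pending = []  # new (id, text) pairs awaiting upload

    def sequence(self, text):
        # Return the id for text, assigning the next free id if it is new.
        if text not in self.phrases:
            self.phrases[text] = self.next_id
            self.pending.append((self.next_id, text))
            self.next_id += 1
        return self.phrases[text]

    def upload_new_phrases(self):
        self.corpus.upload_phrases(self.pending)
        self.pending = []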
class DocumentIngester(object):

    def __init__(self, corpus, parser=sentence_parse, compute_similarities=True):
        """Return a new ingester for the corpus.

        parser may be sentence_parse or ngram_parser(n)

        Client must ensure that no other ingester is running
        concurrently on the same corpus.
        """
        self.corpus = corpus
        self.parser = parser
        self.should_compute_similarities = compute_similarities

        max_doc_id = corpus.max_doc_id()
        self.next_id = max_doc_id + 1 if max_doc_id is not None else 0

        self.document_file = tempfile.TemporaryFile()
        self.document_writer = UnicodeWriter(self.document_file)
        self.occurrence_file = tempfile.TemporaryFile()

        self.sequencer = PhraseSequencer(corpus)

    def _record_document(self, text, phrases, metadata):
        doc_id = self.next_id
        self.next_id += 1

        # Format metadata as a PostgreSQL hstore literal, escaping
        # backslashes and double quotes in the values.
        formatted_metadata = ",".join(
            '"%s"=>"%s"' % (key, value.replace('\\', '\\\\').replace('"', '\\"'))
            for (key, value) in metadata.items())

        self.document_writer.writerow([str(self.corpus.id), str(doc_id), text, formatted_metadata])

        for (phrase_id, indexes) in phrases:
            # Format the (start, end) pairs as a CSV-quoted PostgreSQL array literal.
            formatted_indexes = '"{%s}"' % ", ".join('""(%s, %s)""' % (start, end) for (start, end) in indexes)
            self.occurrence_file.write("%s,%s,%s,%s\n" % (self.corpus.id, doc_id, phrase_id, formatted_indexes))

        return doc_id

    def _upload_new_documents(self):
        """Upload document text and phrase occurrences, then reset the
        temporary files and writer for the next batch."""

        self.document_file.flush()
        self.document_file.seek(0)
        self.corpus.upload_csv(self.document_file, 'documents')
        self.document_file.close()
        self.document_file = tempfile.TemporaryFile()
        self.document_writer = UnicodeWriter(self.document_file)

        self.occurrence_file.flush()
        self.occurrence_file.seek(0)
        self.corpus.upload_csv(self.occurrence_file, 'phrase_occurrences')
        self.occurrence_file.close()
        self.occurrence_file = tempfile.TemporaryFile()

    def ingest(self, docs):
        """Ingest a set of new documents.

        Each doc may be a plain string or a dict with 'text' and
        'metadata' keys.
        """
        new_doc_ids = list()

        print "parsing %s documents..." % len(docs)

        for doc in docs:
            if isinstance(doc, basestring):
                text = doc
                metadata = {}
            else:
                text = doc['text']
                metadata = doc['metadata']

            phrases = self.parser(text, self.sequencer)
            doc_id = self._record_document(text, phrases, metadata)
            new_doc_ids.append(doc_id)

        print "uploading documents..."

        self.sequencer.upload_new_phrases()
        self._upload_new_documents()

        if self.should_compute_similarities:
            print "computing similarities..."
            self.compute_similarities(new_doc_ids)

    @staticmethod
    def _pairs_for_comparison(all_ids, new_ids):
        allowed_ids = set(all_ids)

        all_ids = list(all_ids)
        all_ids.sort()

        new_ids = list(new_ids)
        new_ids.sort(reverse=True)

        for x in all_ids:
            for y in new_ids:
                if x >= y:
                    break
                if y in allowed_ids:
                    yield (x, y)

    def compute_similarities(self, new_doc_ids=None, min_similarity=0.5):
        docs = self.corpus.all_docs()

        # new_doc_ids is used to avoid recomputing already-known similarities.
        # None is a special signal to compute on all doc pairs.
        if new_doc_ids is None:
            new_doc_ids = docs.keys()

        with get_similarity_writer(self.corpus.id) as writer:
            i = 0
            for (x, y) in self._pairs_for_comparison(docs.keys(), new_doc_ids):
                similarity = jaccard(docs[x], docs[y])
                if similarity >= min_similarity:
                    writer.write(x, y, similarity)
                i += 1
                if i % 10000000 == 0:
                    writer.flush()
                    sys.stdout.write('.')
                    sys.stdout.flush()
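# _pairs_for_comparison yields each unordered pair exactly once, as (x, y)
# with x < y, and only pairs involving at least one new document; pairs of
# two previously ingested documents are skipped. For example:
#
#   >>> list(DocumentIngester._pairs_for_comparison([0, 1, 2, 3], [2, 3]))
#   [(0, 3), (0, 2), (1, 3), (1, 2), (2, 3)]
#
# (0, 1) is absent because neither 0 nor 1 is new.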
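# jaccard and get_similarity_writer are imported from elsewhere in the
# project and are not shown in this excerpt. For reference, a minimal
# sketch of Jaccard similarity, assuming corpus.all_docs() maps each
# document id to the set of phrase ids it contains:
def jaccard_sketch(a, b):
    # |a & b| / |a | b|, guarding against two empty sets
    if not a and not b:
        return 0.0
    intersection = len(a & b)
    return float(intersection) / (len(a) + len(b) - intersection)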
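# A minimal end-to-end sketch (not from the original source), assuming a
# Corpus class providing the max_doc_id()/upload_csv()/all_docs() interface
# used above. Plain strings get empty metadata; dicts carry text plus metadata.
if __name__ == '__main__':
    corpus = Corpus()
    ingester = DocumentIngester(corpus)
    ingester.ingest([
        'This document has three sentences. One of which matches. Two of which do not.',
        {'text': 'This document has only two sentences. One of which matches.',
         'metadata': {'source': 'example'}},
    ])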