def doc_to_fields(doc_id, radius=100):

    """
    Search for field / department codes in a document.

    Args:
        doc_id (int)
        radius (int)
    """

    row = Document_Text.get(Document_Text.document == doc_id)
    text = row.text

    # Try each subfield's pattern against the text.
    for subfield in Subfield.select():

        match = subfield.search(text)

        if not match:
            continue

        # Snippet window around the match, clamped to the text bounds.
        start = max(match.start() - radius, 0)
        end = min(match.end() + radius, len(text))

        # Link field -> doc.
        Subfield_Document.create(
            subfield=subfield,
            document=row.document,
            offset=match.start(),
            snippet=crunch(text[start:end]),
        )
def syllabus_refinement(in_file, out_file, r, threshold):

    """
    Select the N documents around a given threshold in the syllabus /
    not-syllabus classifier predictions.

    Args:
        in_file: Open input CSV with (path, score) rows.
        out_file: Open output file for the (id, title, text) CSV.
        r (int): Number of documents to take on each side of the threshold.
        threshold (float): Classifier score to center the window on.
    """

    cols = ['path', 'score']
    reader = csv.DictReader(in_file, cols)

    # Gather ordered (path, score) tuples. (Loop variable renamed so it
    # doesn't shadow the `r` radius parameter.)
    scores = [(row['path'], float(row['score'])) for row in reader]
    scores = sorted(scores, key=lambda x: x[1], reverse=True)

    # Get the index of the document closest to the threshold.
    center = min(
        range(len(scores)),
        key=lambda i: abs(scores[i][1] - threshold),
    )

    # CSV writer.
    cols = ['id', 'title', 'text']
    writer = csv.DictWriter(out_file, cols)
    writer.writeheader()

    # Clamp the lower bound at 0 — a negative start index would wrap
    # around and silently select documents from the END of the ranking.
    lower = max(center - r, 0)

    for path, score in scores[lower:center + r]:

        row = (
            Document_Text
            .select(Document_Text.text, Document.path)
            .join(Document)
            .where(Document.path == path)
            .naive()
            .first()
        )

        writer.writerow({'id': row.path, 'title': row.path, 'text': row.text})
def doc_to_fields(doc_id, radius=100):

    """
    Search for field / department codes in a document.

    Args:
        doc_id (int)
        radius (int)
    """

    doc_text = Document_Text.get(Document_Text.document == doc_id)

    for subfield in Subfield.select():

        match = subfield.search(doc_text.text)

        # Link field -> doc when the code appears in the text.
        if match:

            # Clamp the snippet window to the document bounds.
            lo = max(match.start() - radius, 0)
            hi = min(match.end() + radius, len(doc_text.text))

            Subfield_Document.create(
                subfield=subfield,
                document=doc_text.document,
                offset=match.start(),
                snippet=crunch(doc_text.text[lo:hi]),
            )
def test_text_extraction_fails(mock_osp):

    """
    If no text can be extracted, don't write the row.
    """

    # A document backed by an empty file.
    document = Document.create(path=mock_osp.add_file(content=''))

    ext_text(document.id)

    # No text row should have been written.
    assert Document_Text.select().count() == 0
def es_stream_docs(cls):

    """
    Index document texts.

    Yields:
        dict: The next document.
    """

    for row in query_bar(Document_Text.select()):

        yield {
            '_id': row.document_id,
            'body': row.text,
        }
def es_stream_docs(cls):

    """
    Index document texts.

    Yields:
        dict: The next document.
    """

    rows = query_bar(Document_Text.select())

    for row in rows:
        yield dict(_id=row.document_id, body=row.text)
def test_text_extraction_succeeds(mock_osp):

    """
    read_text() should extract text for a document and write the result
    into the `document_text` table.
    """

    # A document backed by a file with known content.
    document = Document.create(path=mock_osp.add_file(content='text'))

    ext_text(document.id)

    # The extracted text should be stored on the new row.
    assert Document_Text.get(Document_Text.document == document).text == 'text'
def term_counts(out_file, n):

    """
    Write word frequency counts for N docs.
    """

    # CSV writer.
    writer = csv.DictWriter(out_file, ['term', 'count'])
    writer.writeheader()

    # Pull counts, most frequent first.
    for term, count in Document_Text.term_counts(n).most_common():
        writer.writerow({'term': term, 'count': count})
def test_text_extraction_succeeds(mock_osp):

    """
    read_text() should extract text for a document and write the result
    into the `document_text` table.
    """

    # Add a file, create a document row.
    path = mock_osp.add_file(content='text')
    document = Document.create(path=path)

    ext_text(document.id)

    # Pop out the new row and check its text.
    row = Document_Text.get(Document_Text.document == document)

    assert row.text == 'text'
def ext_text(doc_id):

    """
    Write the document as plain text.

    Args:
        doc_id (int): The document id.
    """

    doc = Document.get(Document.id == doc_id)

    text = doc.syllabus.text

    # Skip documents with no extractable text.
    if not text:
        return

    return Document_Text.create(text=text, document=doc)
def random(out_file, n):

    """
    Write a CSV with plaintext for N random docs.
    """

    # CSV writer.
    writer = csv.DictWriter(out_file, ['id', 'title', 'text'])
    writer.writeheader()

    # N randomly-ordered documents with their texts.
    query = (
        Document_Text
        .select(Document_Text.text, Document.path)
        .join(Document)
        .order_by(fn.random())
        .limit(n)
    )

    for row in query_bar(query):
        writer.writerow({'id': row.path, 'title': row.path, 'text': row.text})
def term_counts(out_file, n):

    """
    Write word frequency counts for N docs.
    """

    # CSV writer.
    cols = ['term', 'count']
    writer = csv.DictWriter(out_file, cols)
    writer.writeheader()

    # Pull counts.
    counts = Document_Text.term_counts(n)

    # Most frequent terms first.
    for term, count in counts.most_common():
        writer.writerow(dict(term=term, count=count))
def truncated(out_file, frag_len):

    """
    Write a CSV with truncated document texts.
    """

    # CSV writer.
    writer = csv.DictWriter(out_file, ['id', 'title', 'text'])
    writer.writeheader()

    query = (
        Document_Text
        .select(Document_Text.text, Document.path)
        .join(Document)
    )

    for row in query_bar(query):

        writer.writerow({
            'id': row.path,
            'title': row.path,
            # Truncate to the first `frag_len` characters.
            'text': row.text[:frag_len],
        })