def syllabus_refinement(in_file, out_file, r, threshold):
    """
    Select the documents around a given threshold in the syllabus /
    not-syllabus classifier predictions and write them as a labeling CSV.

    Args:
        in_file: Readable file containing (path, score) CSV rows.
        out_file: Writable file for the (id, title, text) CSV output.
        r (int): Radius — number of documents taken on each side of the
            threshold document.
        threshold (float): Score around which to center the selection.
    """
    cols = ['path', 'score']
    reader = csv.DictReader(in_file, cols)

    # Gather ordered (path, score) tuples. (Named `row`, not `r`, to avoid
    # visual collision with the radius parameter.)
    scores = [(row['path'], float(row['score'])) for row in reader]
    scores = sorted(scores, key=lambda x: x[1], reverse=True)

    # Index of the document whose score is closest to the threshold.
    center = min(
        range(len(scores)),
        key=lambda i: abs(scores[i][1] - threshold),
    )

    # CSV writer.
    cols = ['id', 'title', 'text']
    writer = csv.DictWriter(out_file, cols)
    writer.writeheader()

    # Clamp the lower bound at 0 — a negative slice start would wrap around
    # and silently pull documents from the bottom of the ranking instead of
    # the neighborhood of the threshold.
    start = max(center - r, 0)

    for path, score in scores[start:center + r]:

        row = (
            Document_Text
            .select(Document_Text.text, Document.path)
            .join(Document)
            .where(Document.path == path)
            .naive()
            .first()
        )

        writer.writerow({'id': row.path, 'title': row.path, 'text': row.text})
def test_text_extraction_fails(mock_osp):
    """
    If no text can be extracted, don't write the row.
    """
    # Register an empty file as a document.
    document = Document.create(path=mock_osp.add_file(content=''))

    ext_text(document.id)

    # No text row should have been written.
    assert Document_Text.select().count() == 0
def es_stream_docs(cls):
    """
    Index document texts.

    Yields:
        dict: The next document.
    """
    for text in query_bar(Document_Text.select()):
        yield {
            '_id': text.document_id,
            'body': text.text,
        }
def es_stream_docs(cls):
    """
    Index document texts.

    Yields:
        dict: The next document.
    """
    # NOTE(review): this appears to be an exact duplicate of an identical
    # es_stream_docs defined earlier in this file — consider removing one.
    for row in query_bar(Document_Text.select()):
        # PEP 8: no spaces around '=' in keyword arguments.
        yield dict(
            _id=row.document_id,
            body=row.text,
        )
def random(out_file, n):
    """
    Write a CSV with plaintext for N random docs.
    """
    # CSV writer with the labeling-tool column layout.
    writer = csv.DictWriter(out_file, ['id', 'title', 'text'])
    writer.writeheader()

    # Sample N documents in random order at the database level.
    query = (
        Document_Text
        .select(Document_Text.text, Document.path)
        .join(Document)
        .order_by(fn.random())
        .limit(n)
    )

    for doc in query_bar(query):
        writer.writerow({'id': doc.path, 'title': doc.path, 'text': doc.text})
def truncated(out_file, frag_len):
    """
    Write a CSV with truncated document texts.
    """
    # CSV writer with the labeling-tool column layout.
    writer = csv.DictWriter(out_file, ['id', 'title', 'text'])
    writer.writeheader()

    query = (
        Document_Text
        .select(Document_Text.text, Document.path)
        .join(Document)
    )

    for doc in query_bar(query):
        # Keep only the leading frag_len characters of the text.
        writer.writerow({
            'id': doc.path,
            'title': doc.path,
            'text': doc.text[:frag_len],
        })