def handle(self, *args, **options):
    """Load CREC records from S3 for every day in the requested range.

    Starting at ``options['start_date']`` truncated to midnight, and
    stopping before ``options['end_date']``, the day's ``mods.xml`` object
    is fetched from ``options['source_bucket']`` and parsed with
    ``extract_crecs_from_mods``. Every record that is not skippable is
    either logged (when ``options['to_stdout']`` is truthy) or saved as an
    elasticsearch document, after which its speaker word counts are
    uploaded.

    A failed S3 fetch (``botocore`` ``ClientError``) or any exception raised
    while processing a day's records is logged, and the loop moves on to
    the next day.

    Args:
        *args: Unused positional arguments.
        **options: Command options. Reads ``start_date``, ``end_date``,
            ``to_stdout``, ``source_bucket`` and, unless writing to stdout,
            ``es_url``.
    """
    s3 = boto3.resource(
        's3',
        aws_access_key_id=settings.AWS_ACCESS_KEY,
        aws_secret_access_key=settings.AWS_SECRET_ACCESS_KEY,
    )
    day = options['start_date'].replace(
        hour=0, minute=0, second=0, microsecond=0
    )
    to_stdout = options['to_stdout']

    # Elasticsearch is only needed when documents are actually saved.
    if not to_stdout:
        connections.create_connection(
            hosts=[options['es_url']], **settings.ES_CONNECTION_PARAMS
        )
        CRECDoc.init()

    while day < options['end_date']:
        logger.info('Processing files for {0}.'.format(day))
        try:
            response = s3.Object(
                options['source_bucket'], crec_s3_key('mods.xml', day)
            ).get()
        except botocore.exceptions.ClientError:
            logger.info('Could not find mods file for {0}.'.format(day))
            response = None

        if response is not None and response.get('Body'):
            try:
                crecs = extract_crecs_from_mods(response['Body'])
                logger.info('Found {0} new records.'.format(len(crecs)))
                if to_stdout:
                    logger.info('Using stdout:')
                for crec in crecs:
                    if crec.is_skippable():
                        continue
                    if to_stdout:
                        logger.info(crec.to_es_doc())
                        continue
                    es_doc = crec.to_es_doc()
                    es_doc.save()
                    upload_speaker_word_counts(crec)
            except Exception:
                # One bad day must not abort the rest of the date range.
                logger.exception(
                    'Error processing data for {0}.'.format(
                        day.strftime('%Y-%m-%d')
                    )
                )
        day += timedelta(days=1)
def setUp(self):
    """Index twenty test CREC documents and create a test client.

    The documents have titles ``'0'`` through ``'19'``, share the same
    content, and have issue dates cycling through January 1-5, 2017. Each
    one is saved with ``refresh=True``.
    """
    self.es_conn = connections.get_connection()
    self.test_crecs = [
        CRECDoc(
            title=str(n),
            content='foo bar baz Foo',
            date_issued=datetime(2017, 1, n % 5 + 1),
        )
        for n in range(20)
    ]
    self.index = Index(settings.ES_CW_INDEX)
    CRECDoc.init()
    for doc in self.test_crecs:
        doc.save(refresh=True)
    self.client = Client()
def get_text_search_results(start_date, end_date, terms, size=10, offset=0):
    """Run a phrase "match" query for each field given in ``terms``.

    One ``Match`` query of type ``phrase`` is added per entry in ``terms``
    through chained ``search.query(...)`` calls, and results are restricted
    to documents whose ``date_issued`` lies within the inclusive date range.

    NOTE(review): the previous docstring said multiple fields are or'd
    together, but elasticsearch-dsl combines chained ``query()`` calls with
    AND -- confirm which semantics are intended.

    Args:
        start_date (datetime): Start of date range (inclusive).
        end_date (datetime): End of date range (inclusive).
        terms (dict): A dict mapping field name to search term.
        size (int): The number of results to retrieve, defaults to 10.
        offset (int): The offset of the first result to return (for
            pagination).

    Returns:
        list: CREC documents as dicts, highest score first. Each dict has
            ``date_issued`` formatted as ``YYYY-MM-DD`` and an added
            ``score`` key.
    """
    query = CRECDoc.search()
    for field_name, phrase in terms.items():
        match_spec = {field_name: {'query': phrase, 'type': 'phrase'}}
        query = query.query(Match(**match_spec))

    date_window = {'gte': start_date, 'lte': end_date}
    query = query.filter('range', date_issued=date_window).sort('_score')
    query = query[offset:offset + size]

    docs = []
    for hit in query.execute():
        doc = hit.to_dict()
        doc['date_issued'] = hit.date_issued.strftime('%Y-%m-%d')
        doc['score'] = hit.meta.score
        docs.append(doc)

    # Re-sort the fetched page locally so the best score comes first.
    docs.sort(key=lambda doc: -doc['score'])
    return docs
def setUp(self):
    """Populate the index with twenty CREC documents for the tests.

    Titles are the stringified numbers 0-19, the content is identical for
    every document, and the issue dates repeat over 2017-01-01 through
    2017-01-05. Documents are saved with ``refresh=True``, and a test
    client is created last.
    """
    shared_content = 'foo bar baz Foo'

    self.es_conn = connections.get_connection()
    self.test_crecs = [
        CRECDoc(
            title=str(number),
            content=shared_content,
            date_issued=datetime(2017, 1, 1 + number % 5),
        )
        for number in range(20)
    ]
    self.index = Index(settings.ES_CW_INDEX)
    CRECDoc.init()
    for crec in self.test_crecs:
        crec.save(refresh=True)
    self.client = Client()
def test_search_by_content(self):
    """Searching by content returns only the document with that content.

    Saves one extra document whose content is ``'blah'``, queries
    ``/cwapi/search/`` for that content over January 2017, and expects
    exactly that document back.
    """
    c = CRECDoc(title='foo', content='blah', date_issued=datetime(2017, 1, 1))
    c.save(refresh=True)
    start_date = datetime(2017, 1, 1)
    end_date = datetime(2017, 1, 30)
    query_args = {
        'start_date': start_date.strftime('%Y-%m-%d'),
        'end_date': end_date.strftime('%Y-%m-%d'),
        'content': 'blah',
    }
    response = self.client.get('/cwapi/search/', query_args)
    response_content = response.json()
    results = response_content['data']
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(1, len(results))
    self.assertEqual('foo', results[0]['title'])
    self.assertEqual('blah', results[0]['content'])
def test_search_by_content(self):
    """Searching by content returns only the document with that content.

    Saves one extra document whose content is ``'blah'``, queries
    ``/cwapi/search/`` for that content over January 2017, and expects
    exactly that document back.
    """
    c = CRECDoc(
        title='foo',
        content='blah',
        date_issued=datetime(2017, 1, 1)
    )
    c.save(refresh=True)
    start_date = datetime(2017, 1, 1)
    end_date = datetime(2017, 1, 30)
    query_args = {
        'start_date': start_date.strftime('%Y-%m-%d'),
        'end_date': end_date.strftime('%Y-%m-%d'),
        'content': 'blah',
    }
    response = self.client.get('/cwapi/search/', query_args)
    response_content = response.json()
    results = response_content['data']
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(1, len(results))
    self.assertEqual('foo', results[0]['title'])
    self.assertEqual('blah', results[0]['content'])
def handle(self, *args, **options):
    """Process one ``mods.xml`` file per day and index its CREC records.

    The loop starts at midnight of ``options['start_date']`` and advances
    one day at a time while the current day is before
    ``options['end_date']``. For each day the ``mods.xml`` object is read
    from ``options['source_bucket']``; if it has a body, the records
    extracted from it are handled one by one, ignoring skippable ones:

    * with ``options['to_stdout']`` set, the elasticsearch document is
      only logged;
    * otherwise the document is saved and the record's speaker word
      counts are uploaded.

    S3 ``ClientError``s and any exception raised while handling a day's
    records are logged without stopping the loop.

    Args:
        *args: Unused positional arguments.
        **options: Command options. Reads ``start_date``, ``end_date``,
            ``to_stdout``, ``source_bucket`` and, unless writing to stdout,
            ``es_url``.
    """
    s3 = boto3.resource(
        's3',
        aws_access_key_id=settings.AWS_ACCESS_KEY,
        aws_secret_access_key=settings.AWS_SECRET_ACCESS_KEY,
    )
    current = options['start_date'].replace(
        hour=0, minute=0, second=0, microsecond=0
    )
    save_to_es = not options['to_stdout']
    if save_to_es:
        connections.create_connection(
            hosts=[options['es_url']], **settings.ES_CONNECTION_PARAMS
        )
        CRECDoc.init()

    while current < options['end_date']:
        logger.info('Processing files for {0}.'.format(current))
        response = None
        try:
            response = s3.Object(
                options['source_bucket'],
                crec_s3_key('mods.xml', current),
            ).get()
        except botocore.exceptions.ClientError:
            logger.info('Could not find mods file for {0}.'.format(current))

        if response is not None and response.get('Body'):
            try:
                crecs = extract_crecs_from_mods(response['Body'])
                logger.info('Found {0} new records.'.format(len(crecs)))
                if not save_to_es:
                    logger.info('Using stdout:')
                wanted = (c for c in crecs if not c.is_skippable())
                for crec in wanted:
                    if save_to_es:
                        es_doc = crec.to_es_doc()
                        es_doc.save()
                        upload_speaker_word_counts(crec)
                    else:
                        logger.info(crec.to_es_doc())
            except Exception:
                # Keep going with the next day even if this one failed.
                logger.exception(
                    'Error processing data for {0}.'.format(
                        current.strftime('%Y-%m-%d')
                    )
                )
        current += timedelta(days=1)
def to_es_doc(self):
    """Build the elasticsearch document for this parsed CREC record.

    Returns:
        CRECDoc: A document populated from this object's attributes. The
            object's ``id`` is stored as ``crec_id`` and ``speakers`` is
            flattened into a single comma-separated string.
    """
    fields = {
        'title': self.title,
        'title_part': self.title_part,
        'date_issued': self.date_issued,
        'content': self.content,
        'crec_id': self.id,
        'pdf_url': self.pdf_url,
        'html_url': self.html_url,
        'page_start': self.page_start,
        'page_end': self.page_end,
        'speakers': ','.join(self.speakers),
        'segments': self.segments,
    }
    return CRECDoc(**fields)
def get_text_search_results(start_date, end_date, terms, size=10, offset=0):
    """Search CREC documents with one phrase "match" query per given field.

    Every ``field -> term`` pair in ``terms`` becomes a ``Match`` query of
    type ``phrase`` added through a chained ``search.query(...)`` call. The
    search is filtered to ``date_issued`` values between ``start_date`` and
    ``end_date`` (both inclusive), sorted on ``_score`` and sliced to the
    requested page.

    NOTE(review): the previous docstring said multiple fields are or'd
    together, but elasticsearch-dsl combines chained ``query()`` calls with
    AND -- confirm which semantics are intended.

    Args:
        start_date (datetime): Start of date range.
        end_date (datetime): End of date range.
        terms (dict): A dict mapping field name to search term.
        size (int): The number of results to retrieve, defaults to 10.
        offset (int): The offset of the first result to return (for
            pagination).

    Returns:
        list: A list of CREC documents as dicts, reverse sorted by score,
            each with a ``YYYY-MM-DD`` ``date_issued`` string and a
            ``score`` entry.
    """
    def as_result(hit):
        # Flatten a search hit into a plain dict carrying its score.
        row = hit.to_dict()
        row['date_issued'] = hit.date_issued.strftime('%Y-%m-%d')
        row['score'] = hit.meta.score
        return row

    search = CRECDoc.search()
    for field, search_term in terms.items():
        phrase_match = Match(
            **{field: {'query': search_term, 'type': 'phrase'}}
        )
        search = search.query(phrase_match)
    search = search.filter(
        'range', date_issued={'gte': start_date, 'lte': end_date}
    )
    search = search.sort('_score')
    page = search[offset:offset + size]

    rows = [as_result(hit) for hit in page.execute()]
    return sorted(rows, key=lambda row: -row['score'])