Example #1
0
'''
legisletters: collect, archive, and make searchable legislators' letters
'''

from legisletters.constants import ES_INDEX_NAME, ES_RAW_DOC_TYPE
from legisletters.utils import get_index, get_logger, add_raw_doc
import traceback

#from legisletters.scraper import download_url

LOGGER = get_logger(__name__)

if __name__ == '__main__':

    # Open (or create) the Elasticsearch index used by legisletters.
    ES = get_index(ES_INDEX_NAME, LOGGER)

    # Match raw documents that have no "pdf" field yet; fetch only their URL.
    QUERY = {"fields": ["url"], "filter": {"missing": {"field": "pdf"}}}

    # Page through the matches in batches of QUERY_SIZE via `from_`/`size`.
    OFFSET = 0
    QUERY_SIZE = 100
    while True:
        # Log the total count of raw-letter docs alongside the current offset.
        LOGGER.info('%s docs in raw_letter, offset %s',
                    ES.count('legisletters', ES_RAW_DOC_TYPE), OFFSET)
        DOCS = ES.search(
            index='legisletters',  # pylint: disable=unexpected-keyword-arg
            size=QUERY_SIZE,
            doc_type=ES_RAW_DOC_TYPE,
            body=QUERY,
            from_=OFFSET)['hits']['hits']

        # NOTE(review): the excerpt is truncated here -- the suite of this
        # `if` (and the remainder of the while loop) is missing from SOURCE,
        # so the loop's termination/processing logic cannot be documented.
        if len(DOCS) == 0:
Example #2
0
'''
legisletters: collect, archive, and make searchable legislators' letters
'''

from legisletters.constants import ES_INDEX_NAME, ES_RAW_LETTER_DOC_TYPE
from legisletters.utils import get_index, get_logger

from legisletters.scraper import download_url

LOGGER = get_logger(__name__)

if __name__ == '__main__':

    # Open (or create) the Elasticsearch index used by legisletters.
    ES = get_index(ES_INDEX_NAME, LOGGER)

    # Match letters whose recipients have not been extracted yet.
    QUERY = {
        "filter": {
            "missing": {
                "field": "recipients"
            }
        }
    }

    # BUG FIX: the original referenced ES_LETTER_DOC_TYPE, which is never
    # imported (the import above is ES_RAW_LETTER_DOC_TYPE) and would raise
    # NameError on the first loop evaluation.
    for doc in ES.search(size=100, doc_type=ES_RAW_LETTER_DOC_TYPE, body=QUERY)['hits']['hits']:  # pylint: disable=unexpected-keyword-arg
        # Parenthesized print works identically under Python 2 and 3.
        print(doc['_source']['url'])
        # Re-scrape the letter so recipient data can be (re)extracted.
        download_url(doc['_source']['url'], ES)