Example #1
def generated_conference_paper(scrape_pos_conference_paper_page_body):
    """Return results generator from the PoS spider."""
    # environment variables needed for the pipeline payload
    os.environ['SCRAPY_JOB'] = 'scrapy_job'
    os.environ['SCRAPY_FEED_URI'] = 'scrapy_feed_uri'
    os.environ['SCRAPY_LOG_FILE'] = 'scrapy_log_file'

    crawler = Crawler(spidercls=pos_spider.POSSpider)
    spider = pos_spider.POSSpider.from_crawler(crawler)
    request = next(spider.parse(
        fake_response_from_file(
            file_name='pos/sample_pos_record.xml',
        )
    ))
    response = HtmlResponse(
        url=request.url,
        request=request,
        body=scrape_pos_conference_paper_page_body,
        encoding='utf-8',
    )
    assert response

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)
    parsed_item = next(request.callback(response))
    crawl_result = pipeline.process_item(parsed_item, spider)
    assert crawl_result['record']

    yield crawl_result['record']

    clean_dir()
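
All of the fixtures on this page tear down by calling clean_dir, a helper from the project's test library that wipes a scratch directory left over from the crawl. As a minimal sketch only (the default path and exact behaviour are assumptions, not the project's actual implementation), such a helper can be as small as:

import shutil


def clean_dir(path='/tmp/WSP/'):
    # Hypothetical sketch: remove the whole scratch tree; ignore_errors
    # keeps teardown quiet when the crawl never created the directory.
    shutil.rmtree(path, ignore_errors=True)
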
Example #2
def many_results(spider):
    """Return results generator from the arxiv spider. Tricky fields, many
    records.
    """
    def _get_processed_record(item, spider):
        record = pipeline.process_item(item, spider)
        return record

    parsed_items = list(
        spider.parse(
            fake_response_from_file(
                'arxiv/sample_arxiv_record.xml',
                response_type=TextResponse,
            )))

    assert parsed_items
    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)

    yield [
        _get_processed_record(parsed_item, spider)
        for parsed_item in parsed_items
    ]

    clean_dir()
Example #3
def generated_conference_paper(scrape_pos_conference_paper_page_body):
    """Return results generator from the PoS spider."""
    # environment variables needed for the pipeline payload
    os.environ['SCRAPY_JOB'] = 'scrapy_job'
    os.environ['SCRAPY_FEED_URI'] = 'scrapy_feed_uri'
    os.environ['SCRAPY_LOG_FILE'] = 'scrapy_log_file'

    crawler = Crawler(spidercls=pos_spider.POSSpider)
    spider = pos_spider.POSSpider.from_crawler(crawler)
    request = next(spider.parse(
        fake_response_from_file(
            file_name='pos/sample_pos_record.xml',
        )
    ))
    response = HtmlResponse(
        url=request.url,
        request=request,
        body=scrape_pos_conference_paper_page_body,
        encoding='utf-8',
    )
    assert response

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)
    parsed_item = next(request.callback(response))
    parsed_record = pipeline.process_item(parsed_item, spider)
    assert parsed_record

    yield parsed_record

    clean_dir()
Example #4
def results():
    """Return results generator from the arxiv spider. All fields, one record.
    """
    def _get_processed_item(item, spider):
        record = pipeline.process_item(item, spider)
        validate(record, 'hep')
        assert record
        return record

    crawler = Crawler(spidercls=arxiv_spider.ArxivSpider)
    spider = arxiv_spider.ArxivSpider.from_crawler(crawler)
    fake_response = fake_response_from_file(
        'arxiv/sample_arxiv_record0.xml',
        response_type=TextResponse,
    )

    test_selectors = fake_response.xpath('.//record')
    parsed_items = [spider.parse_record(sel) for sel in test_selectors]

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)

    yield [
        _get_processed_item(parsed_item, spider)
        for parsed_item in parsed_items
    ]

    clean_dir()
Example #5
def cleanup():
    # The test must wait until the docker environment is up (takes about 10
    # seconds).
    sleep(10)
    yield

    clean_dir('/tmp/file_urls')
    clean_dir('/tmp/DESY')
Example #6
def remove_generated_files(package_location):
    clean_dir(path='/tmp/WSP/')

    _, dirs, files = next(os.walk(package_location))
    for dir_name in dirs:
        clean_dir(os.path.join(package_location, dir_name))
    for file_name in files:
        if not file_name.endswith('.zip'):
            os.unlink(os.path.join(package_location, file_name))
Example #7
def cleanup():
    # The test must wait until the docker environment is up (takes about 10
    # seconds).
    sleep(10)
    yield
    s3 = s3_connection(**S3_CONFIG)
    for bucket in s3.buckets.all():
        for key in bucket.objects.all():
            key.delete()
        bucket.delete()
    clean_dir(path=os.path.join(os.getcwd(), '.scrapy'))
Example #8
def cleanup():
    # The test must wait until the docker environment is up (takes about 10
    # seconds).
    settings = get_settings()['CRAWLER_SETTINGS']
    settings['buckets'] = ['downloaded']
    setup_s3_buckets(**settings)
    time.sleep(10)
    yield

    clean_dir(path=os.path.join(os.getcwd(), '.scrapy'))
    clean_buckets(**settings)
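
Examples #7 and #8 rely on project helpers (s3_connection, setup_s3_buckets, clean_buckets) whose definitions are not shown on this page. Purely as an illustration of what such helpers could look like, the sketch below uses boto3 against an S3-compatible endpoint; the parameter names pulled from the settings dict are assumptions:

import boto3


def s3_connection(s3_host, s3_key, s3_secret, **extra):
    # Hypothetical helper: open a boto3 S3 resource against the
    # S3-compatible service started by the docker environment.
    return boto3.resource(
        's3',
        endpoint_url=s3_host,
        aws_access_key_id=s3_key,
        aws_secret_access_key=s3_secret,
    )


def setup_s3_buckets(buckets=(), **settings):
    # Hypothetical helper: create every bucket the crawl will write to.
    s3 = s3_connection(**settings)
    for name in buckets:
        s3.create_bucket(Bucket=name)


def clean_buckets(buckets=(), **settings):
    # Hypothetical helper: empty and delete all buckets after the test.
    s3 = s3_connection(**settings)
    for bucket in s3.buckets.all():
        bucket.objects.all().delete()
        bucket.delete()
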
Example #9
def json_spider_record(tmpdir):
    from scrapy.http import TextResponse
    spider = arxiv_spider.ArxivSpider()
    items = spider.parse(
        fake_response_from_file(
            'arxiv/sample_arxiv_record10.xml',
            response_type=TextResponse,
        ),
    )
    parsed_record = next(items)
    assert parsed_record
    yield spider, parsed_record

    clean_dir()
Example #10
def json_spider_record(tmpdir):
    from scrapy.http import TextResponse
    spider = arxiv_spider.ArxivSpider()
    fake_response = fake_response_from_file(
        'arxiv/sample_arxiv_record10.xml',
        response_type=TextResponse,
    )

    test_selectors = fake_response.xpath('.//record')
    items = (spider.parse_record(sel) for sel in test_selectors)
    parsed_record = next(items)
    assert parsed_record
    yield spider, parsed_record

    clean_dir()
Example #11
def set_up_local_environment():
    package_location = get_test_suite_path(
        'cds',
        'fixtures',
        'oai_harvested',
        'cds_smoke_records.xml',
        test_suite='functional',
    )

    yield {
        'CRAWLER_HOST_URL': 'http://scrapyd:6800',
        'CRAWLER_PROJECT': 'hepcrawl',
        'CRAWLER_ARGUMENTS': {
            'source_file': 'file://' + package_location,
        }
    }

    clean_dir()
Example #12
def record(scrape_pos_page_body):
    """Return results generator from the PoS spider."""
    crawler = Crawler(spidercls=pos_spider.POSSpider)
    spider = pos_spider.POSSpider.from_crawler(crawler)
    request = next(spider.parse(
        fake_response_from_file('pos/sample_pos_record.xml')))
    response = HtmlResponse(
        url=request.url,
        request=request,
        body=scrape_pos_page_body,
        encoding='utf-8',
    )
    assert response
    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)
    parsed_item = request.callback(response)
    parsed_record = pipeline.process_item(parsed_item, spider)
    assert parsed_record

    yield parsed_record

    clean_dir()
Example #13
def set_up_local_environment():
    package_location = get_test_suite_path(
        'arxiv',
        'fixtures',
        'oai_harvested',
        'arxiv_smoke_record.xml',
        test_suite='functional',
    )

    # The test must wait until the docker environment is up (takes about 5 seconds).
    sleep(5)

    yield {
        'CRAWLER_HOST_URL': 'http://scrapyd:6800',
        'CRAWLER_PROJECT': 'hepcrawl',
        'CRAWLER_ARGUMENTS': {
            'source_file': 'file://' + package_location,
        }
    }

    clean_dir()
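
The set_up_* fixtures only yield configuration; the functional tests pass it on to the project's own crawl helpers. As a hedged illustration only (not the project's actual helper), the yielded dict could drive a crawl directly through scrapyd's schedule.json endpoint, which takes the project, the spider name and any spider arguments as form fields; the spider name used here is an assumption:

import requests


def schedule_crawl(config, spider='arXiv'):
    # Illustrative only: submit a job to the scrapyd instance named in the
    # fixture's CRAWLER_HOST_URL, forwarding CRAWLER_ARGUMENTS as spider
    # arguments. Returns the scrapyd job id.
    payload = {
        'project': config['CRAWLER_PROJECT'],
        'spider': spider,
    }
    payload.update(config['CRAWLER_ARGUMENTS'])
    response = requests.post(
        config['CRAWLER_HOST_URL'] + '/schedule.json',
        data=payload,
    )
    response.raise_for_status()
    return response.json()['jobid']
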
Example #14
def set_up_ftp_environment():
    netrc_location = get_test_suite_path(
        'wsp',
        'fixtures',
        'ftp_server',
        '.netrc',
        test_suite='functional',
    )

    # The test must wait until the docker environment is up (takes about 10 seconds).
    sleep(10)

    yield {
        'CRAWLER_HOST_URL': 'http://scrapyd:6800',
        'CRAWLER_PROJECT': 'hepcrawl',
        'CRAWLER_ARGUMENTS': {
            'ftp_host': 'ftp_server',
            'ftp_netrc': netrc_location,
        }
    }

    clean_dir(path='/tmp/WSP/')
Example #15
def record():
    """Return results generator from the crossref spider. All fields, one record.
    """
    def _get_record_from_processed_item(item, spider):
        crawl_result = pipeline.process_item(item, spider)
        validate(crawl_result['record'], 'hep')
        assert crawl_result
        return crawl_result['record']

    crawler = Crawler(spidercls=crossref_spider.CrossrefSpider)
    spider = crossref_spider.CrossrefSpider.from_crawler(crawler, 'fakedoi')
    fake_response = fake_response_from_file(
        'crossref/sample_crossref_record.json',
        response_type=TextResponse,
    )
    parsed_item = spider.parse(fake_response)

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)

    yield _get_record_from_processed_item(parsed_item, spider)

    clean_dir()
Example #16
def cleanup():
    # The test must wait until the docker environment is up (takes about 10
    # seconds).
    sleep(10)
    yield

    clean_dir(path=os.path.join(os.getcwd(), '.scrapy'))
    clean_dir('/tmp/file_urls')
    clean_dir('/tmp/DESY')
Example #17
def results():
    """Return results generator from the arxiv spider. All fields, one record.
    """
    def _get_record_from_processed_item(item, spider):
        crawl_result = pipeline.process_item(item, spider)
        validate(crawl_result['record'], 'hep')
        assert crawl_result
        return crawl_result['record']

    crawler = Crawler(spidercls=arxiv_spider.ArxivSpider)
    spider = arxiv_spider.ArxivSpider.from_crawler(crawler)
    fake_response = fake_response_from_file(
        'arxiv/sample_arxiv_record0.xml',
        response_type=TextResponse,
    )
    test_selectors = fake_response.xpath('.//record')
    parsed_items = [spider.parse_record(sel) for sel in test_selectors]

    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)

    yield [_get_record_from_processed_item(parsed_item, spider) for parsed_item in parsed_items]

    clean_dir()
Example #18
def many_results(spider):
    """Return results generator from the arxiv spider. Tricky fields, many
    records.
    """
    def _get_processed_record(item, spider):
        record = pipeline.process_item(item, spider)
        return record

    fake_response = fake_response_from_file(
        'arxiv/sample_arxiv_record.xml',
        response_type=TextResponse,
    )

    test_selectors = fake_response.xpath('.//record')
    parsed_items = [spider.parse_record(sel) for sel in test_selectors]
    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)

    yield [
        _get_processed_record(parsed_item, spider)
        for parsed_item in parsed_items
    ]

    clean_dir()
Example #19
def many_results(spider):
    """Return results generator from the arxiv spider. Tricky fields, many
    records.
    """
    def _get_processed_record(item, spider):
        crawl_result = pipeline.process_item(item, spider)
        return crawl_result['record']

    fake_response = fake_response_from_file(
        'arxiv/sample_arxiv_record.xml',
        response_type=TextResponse,
    )

    test_selectors = fake_response.xpath('.//record')
    parsed_items = [spider.parse_record(sel) for sel in test_selectors]
    pipeline = InspireCeleryPushPipeline()
    pipeline.open_spider(spider)

    yield [
        _get_processed_record(parsed_item, spider)
        for parsed_item in parsed_items
    ]

    clean_dir()
Example #20
def remove_generated_files(package_location):
    clean_dir()
    clean_dir(path=os.path.join(os.getcwd(), '.scrapy'))

    _, dirs, files = next(os.walk(package_location))
    for dir_name in dirs:
        clean_dir(os.path.join(package_location, dir_name))
    for file_name in files:
        if not file_name.endswith('.zip'):
            os.unlink(os.path.join(package_location, file_name))
Example #21
def cleanup():
    yield

    clean_dir(path=os.path.join(os.getcwd(), '.scrapy'))
    clean_dir('/tmp/file_urls')
    clean_dir('/tmp/WSP')
Example #22
def cleanup():
    clean_dir()
    clean_dir(path=os.path.join(os.getcwd(), '.scrapy'))
    yield
    clean_dir()
    clean_dir(path=os.path.join(os.getcwd(), '.scrapy'))
Example #23
def cleanup():
    yield
    clean_dir()
Example #24
def cleanup():
    yield
    clean_dir('/tmp/last_runs/')
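
Every fixture on this page follows the same pytest pattern: everything before the yield is setup, the yielded value is what the test function receives, and everything after the yield runs as teardown once the test finishes (which is why the clean_dir calls sit after the yield). A minimal, self-contained sketch of that pattern, with made-up record content:

import pytest


@pytest.fixture
def generated_record():
    # Setup: build whatever the test needs (illustrative data only).
    record = {'titles': [{'title': 'Illustrative record'}]}

    # The test body runs while the fixture is suspended here.
    yield record

    # Teardown: runs after the test, mirroring the clean_dir() calls above.
    record.clear()


def test_record_has_a_title(generated_record):
    assert generated_record['titles'][0]['title'] == 'Illustrative record'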