Python EDPSpiderの例、hepcrawl.spiders.edp_spider.EDPSpider Pythonの例

コード例 #1

0

ファイルを表示

def test_no_dois_jats():
    """Check that a JATS article carrying no DOI yields a record without ``dois``.

    The fixture only has ``publisher-id`` and ``other`` article ids, so the
    parsed record is expected to be a ``HEPRecord`` with no ``dois`` key.
    """
    jats_xml = """
    <article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" xml:lang="en" dtd-version="3.0">
        <front>
            <article-meta>
            <article-id pub-id-type="publisher-id">aa14485-10</article-id><article-id pub-id-type="other">2010A%26A...516A..97N</article-id>
                <title-group>
                    <article-title xml:lang="en">Dielectronic recombination of argon-like ions</article-title>
                </title-group>
            </article-meta>
        </front>
    </article>
    """
    crawler = edp_spider.EDPSpider()
    fake_resp = fake_response_from_string(jats_xml)
    article_node = get_node(crawler, "//article", fake_resp)[0]

    item = crawler.parse_node(fake_resp, article_node)
    # Both the parsed item and its record must be truthy before inspecting it.
    assert item
    assert item.record

    hep_record = item.record
    assert "dois" not in hep_record
    assert isinstance(hep_record, HEPRecord)

コード例 #2

0

ファイルを表示

def record_references_only():
    """Build a record from a JATS article that contains only a reference list.

    The article (two journal citations and one book citation) is run through
    ``EDPSpider.parse_node``; the ``record`` attribute of the parsed item is
    returned after checking that both the item and the record are truthy.
    """
    references_xml = """
    <article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" xml:lang="en" dtd-version="3.0">
        <back>
        <ref-list>
        <title>References</title>
            <ref id="R5"><label>5.</label><mixed-citation publication-type="journal" id="a"><string-name><given-names>R.V.</given-names> <surname>Krishnan</surname></string-name>, <string-name><given-names>G.</given-names> <surname>Panneerselvam</surname></string-name>, <string-name><given-names>P.</given-names> <surname>Manikandan</surname></string-name> <string-name><given-names>M.P.</given-names> <surname>Antony</surname></string-name>, <string-name><given-names>K.</given-names> <surname>Nagarajan</surname></string-name>, <source>J. Nucl. Radiochem. Sci.</source>, <volume>10</volume>.<issue>1</issue>, <fpage>19</fpage>–<lpage>26</lpage> (<year>2009</year>).</mixed-citation></ref>

            <ref id="R44"><label>44.</label><mixed-citation publication-type="journal"><string-name><given-names>L.</given-names> <surname>Cronin</surname></string-name>, <string-name><given-names>P.</given-names> <surname>Sojka</surname></string-name>, <string-name><given-names>A.</given-names> <surname>Lefebvre</surname></string-name>, <source>SAE Technical Paper</source>, DOI: <ext-link ext-link-type="uri" xlink:href="http://dx.doi.org/10.4271/852086">10.4271/852086</ext-link>, (<year>1985</year>)</mixed-citation></ref>

            <ref id="R3"><label>3.</label><mixed-citation publication-type="book"><string-name><given-names>T.</given-names> <surname>Aliyev</surname></string-name>, <string-name><given-names>Т.</given-names> <surname>Belyaev</surname></string-name>, <string-name><given-names>S.</given-names> <surname>Gallagher</surname></string-name> <article-title>Simulation in ANSYS flow to the gas purification section of the multicomponent gas mixture through the dust cyclone CKBN GP-628</article-title>. <source>Mechanical engineering</source>, <publisher-loc>Moscow</publisher-loc>, №<issue>10</issue>, (<year>2014</year>).</mixed-citation></ref>

        </ref-list>
        </back>
    </article>
    """
    crawler = edp_spider.EDPSpider()
    fake_resp = fake_response_from_string(references_xml)
    article_node = get_node(crawler, "//article", fake_resp)[0]

    item = crawler.parse_node(fake_resp, article_node)
    assert item
    assert item.record

    return item.record

コード例 #3

0

ファイルを表示

def test_addendum_jats():
    """Check that an ``addendum`` JATS article exposes the related article DOI.

    The fixture links to another article through ``<related-article>``; the
    record must carry that DOI as the first ``related_article_doi`` value.
    """
    addendum_xml = """
    <article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="addendum" xml:lang="en" dtd-version="3.0">
        <front>
            <article-meta>
            <article-id pub-id-type="publisher-id">aa14485-10</article-id><article-id pub-id-type="other">2010A%26A...516A..97N</article-id>
                <title-group>
                    <article-title xml:lang="en">Dielectronic recombination of argon-like ions</article-title>
                </title-group>
                <related-article ext-link-type="doi" href="10.1051/0004-6361/201014485">
                </related-article>
            </article-meta>
        </front>
    </article>
    """
    crawler = edp_spider.EDPSpider()
    fake_resp = fake_response_from_string(addendum_xml)
    article_node = get_node(crawler, "//article", fake_resp)[0]

    item = crawler.parse_node(fake_resp, article_node)
    assert item
    assert item.record
    hep_record = item.record

    assert "related_article_doi" in hep_record
    first_related = hep_record["related_article_doi"][0]
    assert first_related["value"] == "10.1051/0004-6361/201014485"

コード例 #4

0

ファイルを表示

def test_aff_with_email():
    """Test popping email from the affiliation string. JATS format.

    The affiliation text ends with an ``e-mail: <address>`` suffix. The test
    expects the affiliation value to come back without that suffix and the
    author's own ``email`` field to be ``None``.
    """
    spider = edp_spider.EDPSpider()
    # The fixture is well-formed XML: ``<contrib-group>`` is closed properly
    # (it used to be opened a second time instead of being closed).
    # The e-mail is a synthetic address on a reserved example domain, so that
    # there is an actual address to strip from the affiliation string.
    body = """
    <article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" xml:lang="en" dtd-version="3.0">
        <front>
            <article-meta>
            <contrib-group>
                <contrib contrib-type="author">
                    <name>
                        <surname>Gorczyca</surname>
                        <given-names>T. W.</given-names>
                    </name>
                    <xref ref-type="aff" rid="AFF1">1</xref>
                </contrib>
                <aff id="AFF1">
                    <label>1</label>
                    <addr-line>Department of Physics, Western Michigan University, Kalamazoo, MI 49008, USA e-mail: gorczyca@example.org
                    </addr-line>
                </aff>
            </contrib-group>
            </article-meta>
        </front>
    </article>
    """
    response = fake_response_from_string(body)
    node = get_node(spider, "//article", response)[0]
    record = spider.parse_node(response, node)

    affiliation = "Department of Physics, Western Michigan University, Kalamazoo, MI 49008, USA"
    first_author = record['authors'][0]
    assert 'affiliations' in first_author
    assert first_author['affiliations'][0]['value'] == affiliation
    assert "e-mail" not in first_author['affiliations'][0]['value']
    assert first_author['email'] is None

コード例 #5

0

ファイルを表示

def test_handle_package_ftp(tarbzfile):
    """Check the first request yielded by ``handle_package_ftp``.

    It must be a ``Request`` whose ``source_folder`` meta entry equals the
    archive path that was passed in as the response text.
    """
    crawler = edp_spider.EDPSpider()
    fake_resp = fake_response_from_string(text=tarbzfile)
    ftp_requests = crawler.handle_package_ftp(fake_resp)
    first_request = next(ftp_requests)

    assert isinstance(first_request, Request)
    assert first_request.meta["source_folder"] == tarbzfile

コード例 #6

0

ファイルを表示

def test_no_valid_article():
    """Check that an article of type ``lecture`` produces no record at all."""
    lecture_xml = """
    <article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="lecture" xml:lang="en" dtd-version="3.0">
    </article>
    """
    crawler = edp_spider.EDPSpider()
    fake_resp = fake_response_from_string(lecture_xml)
    article_node = get_node(crawler, "//article", fake_resp)[0]
    parsed = crawler.parse_node(fake_resp, article_node)

    assert parsed is None

コード例 #7

0

ファイルを表示

def record_rich(package_rich):
    """Parse the 'rich' format XML referenced by ``package_rich``.

    The ``file://`` prefix is stripped from ``package_rich.url`` to get a
    local path, the response is flagged with ``meta["rich"] = True`` and the
    first ``EDPSArticle`` node is handed to ``EDPSpider.parse_node``.

    This is not an open access journal, so there is no splash page scraping.
    """
    crawler = edp_spider.EDPSpider()
    local_path = package_rich.url.replace("file://", "")
    rich_response = fake_response_from_file(local_path)
    rich_response.meta["rich"] = True
    article_nodes = get_node(crawler, "//EDPSArticle", rich_response)

    return crawler.parse_node(rich_response, article_nodes[0])

コード例 #8

0

ファイルを表示

def test_collections_review():
    """Check the collections assigned to a ``review-article`` JATS document.

    The record is expected to list both the ``HEP`` and ``Review`` collections,
    in that order.
    """
    review_xml = """
    <article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="review-article" xml:lang="en" dtd-version="3.0">
    </article>
    """
    crawler = edp_spider.EDPSpider()
    fake_resp = fake_response_from_string(review_xml)
    article_node = get_node(crawler, "//article", fake_resp)[0]
    parsed = crawler.parse_node(fake_resp, article_node)

    expected_collections = [{'primary': 'HEP'}, {'primary': 'Review'}]
    assert "collections" in parsed
    assert parsed["collections"] == expected_collections

コード例 #9

0

ファイルを表示

def test_tarfile(tarbzfile, tmpdir):
    """Check ``untar_files`` with and without directory flattening.

    A plain extraction must return exactly one XML file that keeps its nested
    path; with ``flatten=True`` the file name is kept but the nested
    directories are gone from the returned path.
    """
    crawler = edp_spider.EDPSpider()
    target_dir = six.text_type(tmpdir)

    # Plain extraction first, flattened extraction second (same order as before).
    nested = crawler.untar_files(tarbzfile, target_dir)
    flattened = crawler.untar_files(tarbzfile, target_dir, flatten=True)

    assert len(nested) == 1
    assert "aas/xml_rich/2000/01/ds1691.xml" in nested[0]

    flat_path = flattened[0]
    assert "ds1691.xml" in flat_path
    assert "aas/xml_rich/2000/01" not in flat_path

コード例 #10

0

ファイルを表示

def record_jats(package_jats, scrape_pos_page_body):
    """Parse the JATS XML referenced by ``package_jats`` and follow its request.

    ``parse_node`` returns a request here; its callback is invoked with an
    ``HtmlResponse`` built from ``scrape_pos_page_body`` and the callback's
    result is returned.

    This is an open access journal, so the splash page can be scraped.
    """
    crawler = edp_spider.EDPSpider()
    local_path = package_jats.url.replace("file://", "")
    xml_response = fake_response_from_file(local_path)
    article_node = get_node(crawler, "//article", xml_response)[0]
    splash_request = crawler.parse_node(xml_response, article_node)

    # Fake the splash page download with the canned HTML body.
    splash_response = HtmlResponse(
        url=splash_request.url,
        request=splash_request,
        body=scrape_pos_page_body,
        encoding='utf-8',
    )
    return splash_request.callback(splash_response)

コード例 #11

0

ファイルを表示

def test_no_dois_rich():
    """Check that a 'rich' format article without DOI yields no ``dois`` key.

    The record must also lack ``additional_files`` and be a ``HEPRecord``.
    """
    rich_xml = """
    <EDPSArticle>
        <ArticleID Type="Article">
            <EDPSRef>ds1691</EDPSRef>
        </ArticleID>
    </EDPSArticle>
    """
    crawler = edp_spider.EDPSpider()
    fake_resp = fake_response_from_string(rich_xml)
    # Flag the response as 'rich' format, as opposed to JATS.
    fake_resp.meta["rich"] = True
    article_node = get_node(crawler, "//EDPSArticle", fake_resp)[0]
    parsed = crawler.parse_node(fake_resp, article_node)

    for absent_key in ("dois", "additional_files"):
        assert absent_key not in parsed
    assert isinstance(parsed, HEPRecord)

コード例 #12

0

ファイルを表示

def test_author_with_email():
    """Test getting author email. JATS format.

    The author's ``<email>`` element holds a synthetic address on a reserved
    example domain; the parsed record must expose exactly that address as the
    first author's ``email``.
    """
    spider = edp_spider.EDPSpider()
    # The same address is used in the fixture and in the expectation below, so
    # the equality check compares like with like.
    expected_email = "fname.sname@example.org"
    body = """
    <article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="addendum" xml:lang="en" dtd-version="3.0">
        <front>
            <article-meta>
            <contrib-group content-type="authors">
            <contrib contrib-type="author" corresp="yes"><name><surname>Sname</surname><given-names>Fname</given-names></name><email>fname.sname@example.org</email><xref ref-type="aff" rid="AFF1"/><xref ref-type="corresp" rid="FN1">a</xref></contrib>
            </contrib-group>
            </article-meta>
        </front>
    </article>
    """
    response = fake_response_from_string(body)
    node = get_node(spider, "//article", response)[0]
    record = spider.parse_node(response, node)

    first_author = record['authors'][0]
    assert 'email' in first_author
    assert first_author['email'] == expected_email

コード例 #13

0

ファイルを表示

def package_rich(tarbzfile):
    """Run ``handle_package_file`` on the ``tarbzfile`` archive.

    The archive path is exposed to the spider as a ``file://`` URL on an
    otherwise empty fake response; the first item produced is returned.
    """
    crawler = edp_spider.EDPSpider()
    package_url = "file://" + tarbzfile
    fake_resp = fake_response_from_string(text="", url=package_url)
    produced = crawler.handle_package_file(fake_resp)
    return next(produced)

コード例 #14

0

ファイルを表示

def package_jats(targzfile):
    """Extract tar.gz package with JATS XML file.

    The archive path is exposed to the spider as a ``file://`` URL on an
    otherwise empty fake response, and the first item produced by
    ``handle_package_file`` is returned.
    """
    spider = edp_spider.EDPSpider()
    response = fake_response_from_string(text="", url="file://" + targzfile)
    # Use the builtin ``next()`` rather than the Python 2-only ``.next()``
    # method so this works on both Python 2.6+ and Python 3.
    return next(spider.handle_package_file(response))