Beispiel #1
0
def test_addendum_jats():
    """Check that a JATS ``addendum`` article exposes its related-article DOI.

    A minimal ``<article>`` with ``article-type="addendum"`` and a
    ``<related-article>`` element is parsed by the EDP spider; the resulting
    record must contain ``related_article_doi`` whose first entry holds the
    DOI given in the ``href`` attribute.
    """
    jats_xml = """
    <article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="addendum" xml:lang="en" dtd-version="3.0">
        <front>
            <article-meta>
            <article-id pub-id-type="publisher-id">aa14485-10</article-id><article-id pub-id-type="other">2010A%26A...516A..97N</article-id>
                <title-group>
                    <article-title xml:lang="en">Dielectronic recombination of argon-like ions</article-title>
                </title-group>
                <related-article ext-link-type="doi" href="10.1051/0004-6361/201014485">
                </related-article>
            </article-meta>
        </front>
    </article>
    """
    edp = edp_spider.EDPSpider()
    fake_response = fake_response_from_string(jats_xml)
    # Only the first matched <article> element is parsed.
    article = get_node(edp, "//article", fake_response)[0]

    item = edp.parse_node(fake_response, article)
    assert item
    assert item.record
    hep_record = item.record

    expected_doi = "10.1051/0004-6361/201014485"
    assert "related_article_doi" in hep_record
    assert hep_record["related_article_doi"][0]["value"] == expected_doi
Beispiel #2
0
def parsed_node():
    """Parse a BASE record whose link points directly at a PDF file.

    A HEAD reply (status 200, ``application/pdf``) is registered with
    ``responses`` for the link URL, the first node returned by ``get_node``
    is stored in ``response.meta["record"]`` and handed to ``parse_node``.

    Returns:
        The ``record`` attribute of the item produced by ``parse_node``.
    """
    pdf_url = "http://www.example.com/bitstream/1885/10005/1/Butt_R.D._2003.pdf"
    responses.add(
        responses.HEAD,
        pdf_url,
        status=200,
        content_type='application/pdf',
    )
    base = base_spider.BaseSpider()
    oai_xml = """
    <OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/ http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd">
    <record>
        <metadata>
            <base_dc:dc xmlns:base_dc="http://oai.base-search.net/base_dc/" xmlns:dc="http://purl.org/dc/elements/1.1/" xsi:schemaLocation="http://oai.base-search.net/base_dc/         http://oai.base-search.net/base_dc/base_dc.xsd">
            <base_dc:link>http://www.example.com/bitstream/1885/10005/1/Butt_R.D._2003.pdf</base_dc:link>
            </base_dc:dc>
        </metadata>
        </record>
    </OAI-PMH>
    """
    fake_response = fake_response_from_string(text=oai_xml)
    matches = get_node(base, 'OAI-PMH:record', text=oai_xml)
    first_match = matches[0]
    # parse_node is given the serialized record through the response meta.
    fake_response.meta["record"] = first_match.extract()

    item = base.parse_node(fake_response, first_match)
    assert item
    assert item.record

    return item.record
Beispiel #3
0
def parsed_node_missing_scheme():
    """Run ``parse_node`` on a BASE record whose link lacks a URL scheme.

    The record's link is the bare host ``www.example.com``. While parsing,
    HEAD requests to ``http://www.example.com`` are mocked to answer with a
    ``text/html`` content type.

    Returns:
        Whatever ``BaseSpider.parse_node`` returns for the record.
    """
    base = base_spider.BaseSpider()
    oai_xml = """
    <OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/ http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd">
    <record>
        <metadata>
            <base_dc:dc xmlns:base_dc="http://oai.base-search.net/base_dc/" xmlns:dc="http://purl.org/dc/elements/1.1/" xsi:schemaLocation="http://oai.base-search.net/base_dc/         http://oai.base-search.net/base_dc/base_dc.xsd">
            <base_dc:link>www.example.com</base_dc:link>
            </base_dc:dc>
        </metadata>
        </record>
    </OAI-PMH>
    """
    fake_response = fake_response_from_string(text=oai_xml)
    record_nodes = get_node(base, 'OAI-PMH:record', text=oai_xml)
    fake_response.meta["record"] = record_nodes.extract_first()

    html_headers = {
        'Content-Type': 'text/html',
    }
    with requests_mock.Mocker() as http_mock:
        http_mock.head('http://www.example.com', headers=html_headers)
        # The parse call stays inside the mock context so the HEAD mock is
        # active while it runs.
        result = base.parse_node(fake_response, record_nodes)
        return result
Beispiel #4
0
def record_references_only():
    """Parse a JATS article that contains nothing but a reference list.

    The ``<ref-list>`` holds three ``<mixed-citation>`` entries (two of
    publication type ``journal``, one ``book``). The first ``<article>``
    node is parsed with the EDP spider.

    Returns:
        The ``record`` attribute of the parsed item.
    """
    jats_xml = """
    <article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" xml:lang="en" dtd-version="3.0">
        <back>
        <ref-list>
        <title>References</title>
            <ref id="R5"><label>5.</label><mixed-citation publication-type="journal" id="a"><string-name><given-names>R.V.</given-names> <surname>Krishnan</surname></string-name>, <string-name><given-names>G.</given-names> <surname>Panneerselvam</surname></string-name>, <string-name><given-names>P.</given-names> <surname>Manikandan</surname></string-name> <string-name><given-names>M.P.</given-names> <surname>Antony</surname></string-name>, <string-name><given-names>K.</given-names> <surname>Nagarajan</surname></string-name>, <source>J. Nucl. Radiochem. Sci.</source>, <volume>10</volume>.<issue>1</issue>, <fpage>19</fpage>–<lpage>26</lpage> (<year>2009</year>).</mixed-citation></ref>

            <ref id="R44"><label>44.</label><mixed-citation publication-type="journal"><string-name><given-names>L.</given-names> <surname>Cronin</surname></string-name>, <string-name><given-names>P.</given-names> <surname>Sojka</surname></string-name>, <string-name><given-names>A.</given-names> <surname>Lefebvre</surname></string-name>, <source>SAE Technical Paper</source>, DOI: <ext-link ext-link-type="uri" xlink:href="http://dx.doi.org/10.4271/852086">10.4271/852086</ext-link>, (<year>1985</year>)</mixed-citation></ref>

            <ref id="R3"><label>3.</label><mixed-citation publication-type="book"><string-name><given-names>T.</given-names> <surname>Aliyev</surname></string-name>, <string-name><given-names>Т.</given-names> <surname>Belyaev</surname></string-name>, <string-name><given-names>S.</given-names> <surname>Gallagher</surname></string-name> <article-title>Simulation in ANSYS flow to the gas purification section of the multicomponent gas mixture through the dust cyclone CKBN GP-628</article-title>. <source>Mechanical engineering</source>, <publisher-loc>Moscow</publisher-loc>, №<issue>10</issue>, (<year>2014</year>).</mixed-citation></ref>

        </ref-list>
        </back>
    </article>
    """
    edp = edp_spider.EDPSpider()
    fake_response = fake_response_from_string(jats_xml)
    article = get_node(edp, "//article", fake_response)[0]

    item = edp.parse_node(fake_response, article)
    assert item
    assert item.record

    return item.record
Beispiel #5
0
def test_no_dois_jats():
    """Check a JATS article without a DOI yields a record lacking ``dois``.

    The article metadata only carries ``publisher-id`` and ``other``
    identifiers, so the parsed record must not contain a ``dois`` key and
    must still be a ``HEPRecord``.
    """
    jats_xml = """
    <article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" xml:lang="en" dtd-version="3.0">
        <front>
            <article-meta>
            <article-id pub-id-type="publisher-id">aa14485-10</article-id><article-id pub-id-type="other">2010A%26A...516A..97N</article-id>
                <title-group>
                    <article-title xml:lang="en">Dielectronic recombination of argon-like ions</article-title>
                </title-group>
            </article-meta>
        </front>
    </article>
    """
    edp = edp_spider.EDPSpider()
    fake_response = fake_response_from_string(jats_xml)
    article = get_node(edp, "//article", fake_response)[0]

    item = edp.parse_node(fake_response, article)
    assert item
    assert item.record
    hep_record = item.record

    assert "dois" not in hep_record
    assert isinstance(hep_record, HEPRecord)
Beispiel #6
0
def erratum_open_access_record():
    """Parse a 'Published Erratum' article with the IOP spider.

    The spider's ``pdf_files`` attribute is pointed at the
    ``responses/iop/pdf`` directory of the test suite before parsing.

    Returns:
        The ``record`` attribute of the item produced by ``parse_node``.
    """
    article_set_xml = """
    <ArticleSet>
        <Article>
            <Journal>
                <PublisherName>Institute of Physics</PublisherName>
                <JournalTitle>J. Phys.: Conf. Ser.</JournalTitle>
                <Volume>143</Volume>
                <Issue>3</Issue>
            </Journal>
            <FirstPage LZero="save">336</FirstPage>
        <PublicationType>Published Erratum</PublicationType>
        </Article>
    </ArticleSet>
    """
    iop = iop_spider.IOPSpider()
    fake_response = fake_response_from_string(article_set_xml)
    article = get_node(iop, "Article", fake_response)
    pdf_dir = get_test_suite_path('responses', 'iop', 'pdf')
    iop.pdf_files = pdf_dir

    item = iop.parse_node(fake_response, article)
    assert item
    assert item.record

    return item.record
Beispiel #7
0
def record_references_only():
    """Parse a JATS article that contains nothing but a reference list.

    The ``<ref-list>`` holds three ``<mixed-citation>`` entries (two of
    publication type ``journal``, one ``book``). The first ``<article>``
    node is parsed with the EDP spider.

    Returns:
        The ``record`` attribute of the parsed item.
    """
    jats_xml = """
    <article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" xml:lang="en" dtd-version="3.0">
        <back>
        <ref-list>
        <title>References</title>
            <ref id="R5"><label>5.</label><mixed-citation publication-type="journal" id="a"><string-name><given-names>R.V.</given-names> <surname>Krishnan</surname></string-name>, <string-name><given-names>G.</given-names> <surname>Panneerselvam</surname></string-name>, <string-name><given-names>P.</given-names> <surname>Manikandan</surname></string-name> <string-name><given-names>M.P.</given-names> <surname>Antony</surname></string-name>, <string-name><given-names>K.</given-names> <surname>Nagarajan</surname></string-name>, <source>J. Nucl. Radiochem. Sci.</source>, <volume>10</volume>.<issue>1</issue>, <fpage>19</fpage>–<lpage>26</lpage> (<year>2009</year>).</mixed-citation></ref>

            <ref id="R44"><label>44.</label><mixed-citation publication-type="journal"><string-name><given-names>L.</given-names> <surname>Cronin</surname></string-name>, <string-name><given-names>P.</given-names> <surname>Sojka</surname></string-name>, <string-name><given-names>A.</given-names> <surname>Lefebvre</surname></string-name>, <source>SAE Technical Paper</source>, DOI: <ext-link ext-link-type="uri" xlink:href="http://dx.doi.org/10.4271/852086">10.4271/852086</ext-link>, (<year>1985</year>)</mixed-citation></ref>

            <ref id="R3"><label>3.</label><mixed-citation publication-type="book"><string-name><given-names>T.</given-names> <surname>Aliyev</surname></string-name>, <string-name><given-names>Т.</given-names> <surname>Belyaev</surname></string-name>, <string-name><given-names>S.</given-names> <surname>Gallagher</surname></string-name> <article-title>Simulation in ANSYS flow to the gas purification section of the multicomponent gas mixture through the dust cyclone CKBN GP-628</article-title>. <source>Mechanical engineering</source>, <publisher-loc>Moscow</publisher-loc>, №<issue>10</issue>, (<year>2014</year>).</mixed-citation></ref>

        </ref-list>
        </back>
    </article>
    """
    edp = edp_spider.EDPSpider()
    fake_response = fake_response_from_string(jats_xml)
    article = get_node(edp, "//article", fake_response)[0]

    item = edp.parse_node(fake_response, article)
    assert item
    assert item.record

    return item.record
Beispiel #8
0
def parse_without_splash():
    """Parse a DNB MARCXML record whose only link is a direct file URL.

    HEAD requests to the record's ``856`` link are mocked to answer with
    content type ``application/pdf;charset=base64`` while the first node
    matching the spider's ``itertag`` is parsed.

    Returns:
        The ``record`` attribute of the parsed item.
    """
    marc_xml = """
    <OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/ http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd">
    <ListRecords xsi:schemaLocation="http://www.loc.gov/MARC21/slim http://www.loc.gov/standards/marcxml/schema/MARC21slim.xsd">
        <record>
        <metadata>
            <slim:record xmlns:slim="http://www.loc.gov/MARC21/slim" type="Bibliographic">
                <slim:datafield tag="856" ind1=" " ind2="0">
                    <slim:subfield code="u">http://d-nb.info/1079912991/34</slim:subfield>
                </slim:datafield>
            </slim:record>
        </metadata>
        </record>
    </ListRecords>
    </OAI-PMH>
    """
    dnb = dnb_spider.DNBSpider()
    fake_response = fake_response_from_string(marc_xml)
    record_nodes = get_node(dnb, "//" + dnb.itertag, fake_response)

    pdf_headers = {
        'Content-Type': 'application/pdf;charset=base64',
    }
    with requests_mock.Mocker() as http_mock:
        http_mock.head('http://d-nb.info/1079912991/34', headers=pdf_headers)

        item = dnb.parse_node(fake_response, record_nodes[0])
        assert item
        assert item.record

        return item.record
Beispiel #9
0
def test_no_author_no_date_no_url():
    """Parse a listing row that has no author, date, or url.

    ``parse_node`` is expected to yield an item directly; its record must be
    a ``HEPRecord`` containing neither a ``date`` nor an ``authors`` key.
    """
    spider = magic_spider.MagicSpider()
    body = """
    <html>
        <body id="f1d">
            <table class="list" style="margin-left: 20px; width: 920px;">
                <tr class="odd">
                    <td><a>Limits to the violation of...</a></td>
                </tr>
            </table>
        </body>
    </html>
    """
    response = fake_response_from_string(body)
    node = get_node(spider, spider.itertag, text=body)
    # Use the builtin next() rather than the Python 2-only ``.next()``
    # method so the test also runs on Python 3.
    parsed_item = next(spider.parse_node(response, node))
    assert parsed_item
    assert parsed_item.record
    record = parsed_item.record

    assert isinstance(record, hepcrawl.items.HEPRecord)
    assert "date" not in record
    assert "authors" not in record
Beispiel #10
0
def test_no_spash_page():
    """Build the record from listing data when the url cannot be reached.

    The listing row is parsed first; the title and urls found in the
    resulting object's ``meta`` are copied onto a response whose status is
    set to 404, which is then passed to ``scrape_for_pdf``. The first item
    yielded must carry the url and title from the listing.
    """
    spider = magic_spider.MagicSpider()
    body = """
    <html>
        <body id="f1d">
            <table class="list" style="margin-left: 20px; width: 920px;">
                <tr class="odd">
                    <td>
                    <a href="http://non_reachable_url/">Limits to the violation of...</a>
                    </td>
                </tr>
            </table>
        </body>
    </html>
    """
    response = fake_response_from_string(body)
    node = get_node(spider, spider.itertag, text=body)
    parsed_node = spider.parse_node(response, node)

    # Simulate the failed splash-page fetch.
    response.status = 404
    response.meta["title"] = parsed_node.meta["title"]
    response.meta["urls"] = parsed_node.meta["urls"]
    # Use the builtin next() rather than the Python 2-only ``.next()``
    # method so the test also runs on Python 3.
    parsed_item = next(spider.scrape_for_pdf(response))
    assert parsed_item
    assert parsed_item.record
    record = parsed_item.record

    assert isinstance(record, hepcrawl.items.HEPRecord)
    assert "urls" in record
    assert "title" in record
    assert record["urls"][0]["value"] == "http://non_reachable_url/"
    assert record["title"] == "Limits to the violation of..."
Beispiel #11
0
def test_addendum_jats():
    """Check that a JATS ``addendum`` article exposes its related-article DOI.

    A minimal ``<article>`` with ``article-type="addendum"`` and a
    ``<related-article>`` element is parsed by the EDP spider; the resulting
    record must contain ``related_article_doi`` whose first entry holds the
    DOI given in the ``href`` attribute.
    """
    jats_xml = """
    <article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="addendum" xml:lang="en" dtd-version="3.0">
        <front>
            <article-meta>
            <article-id pub-id-type="publisher-id">aa14485-10</article-id><article-id pub-id-type="other">2010A%26A...516A..97N</article-id>
                <title-group>
                    <article-title xml:lang="en">Dielectronic recombination of argon-like ions</article-title>
                </title-group>
                <related-article ext-link-type="doi" href="10.1051/0004-6361/201014485">
                </related-article>
            </article-meta>
        </front>
    </article>
    """
    edp = edp_spider.EDPSpider()
    fake_response = fake_response_from_string(jats_xml)
    # Only the first matched <article> element is parsed.
    article = get_node(edp, "//article", fake_response)[0]

    item = edp.parse_node(fake_response, article)
    assert item
    assert item.record
    hep_record = item.record

    expected_doi = "10.1051/0004-6361/201014485"
    assert "related_article_doi" in hep_record
    assert hep_record["related_article_doi"][0]["value"] == expected_doi
Beispiel #12
0
def test_no_dois_jats():
    """Check a JATS article without a DOI yields a record lacking ``dois``.

    The article metadata only carries ``publisher-id`` and ``other``
    identifiers, so the parsed record must not contain a ``dois`` key and
    must still be a ``HEPRecord``.
    """
    jats_xml = """
    <article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" xml:lang="en" dtd-version="3.0">
        <front>
            <article-meta>
            <article-id pub-id-type="publisher-id">aa14485-10</article-id><article-id pub-id-type="other">2010A%26A...516A..97N</article-id>
                <title-group>
                    <article-title xml:lang="en">Dielectronic recombination of argon-like ions</article-title>
                </title-group>
            </article-meta>
        </front>
    </article>
    """
    edp = edp_spider.EDPSpider()
    fake_response = fake_response_from_string(jats_xml)
    article = get_node(edp, "//article", fake_response)[0]

    item = edp.parse_node(fake_response, article)
    assert item
    assert item.record
    hep_record = item.record

    assert "dois" not in hep_record
    assert isinstance(hep_record, HEPRecord)
Beispiel #13
0
def erratum_open_access_record():
    """Parse a 'Published Erratum' article with the IOP spider.

    The spider's ``pdf_files`` attribute is pointed at the
    ``responses/iop/pdf`` directory of the test suite before parsing.

    Returns:
        The ``record`` attribute of the item produced by ``parse_node``.
    """
    article_set_xml = """
    <ArticleSet>
        <Article>
            <Journal>
                <PublisherName>Institute of Physics</PublisherName>
                <JournalTitle>J. Phys.: Conf. Ser.</JournalTitle>
                <Volume>143</Volume>
                <Issue>3</Issue>
            </Journal>
            <FirstPage LZero="save">336</FirstPage>
        <PublicationType>Published Erratum</PublicationType>
        </Article>
    </ArticleSet>
    """
    iop = iop_spider.IOPSpider()
    fake_response = fake_response_from_string(article_set_xml)
    article = get_node(iop, "Article", fake_response)
    pdf_dir = get_test_suite_path('responses', 'iop', 'pdf')
    iop.pdf_files = pdf_dir

    item = iop.parse_node(fake_response, article)
    assert item
    assert item.record

    return item.record
Beispiel #14
0
def parsed_node_no_splash():
    """Return the record parsed from a JSON doc without a splash page url.

    The first entry of ``items.docs`` (which only carries a ``json_uri``) is
    stored in ``response.meta["jsonrecord"]`` before ``parse`` is called.

    Returns:
        The ``record`` attribute of the first item yielded by ``parse``.
    """
    spider = brown_spider.BrownSpider()
    body = """
    {
    "items": {
        "docs": [
            {
                "json_uri": "https://repository.library.brown.edu/api/pub/items/bdr:11303/"

            }
        ]
    }
    }
    """

    response = fake_response_from_string(body)
    jsonresponse = json.loads(response.body_as_unicode())
    jsonrecord = jsonresponse["items"]["docs"][0]
    response.meta["jsonrecord"] = jsonrecord

    # Use the builtin next() rather than the Python 2-only ``.next()``
    # method so the fixture also works on Python 3.
    parsed_item = next(spider.parse(response))
    assert parsed_item
    assert parsed_item.record

    return parsed_item.record
Beispiel #15
0
def test_handle_package_ftp(tarbzfile):
    """Check the request yielded by ``handle_package_ftp``.

    The first object yielded for a response whose text is *tarbzfile* must
    be a ``Request`` with ``meta["source_folder"]`` equal to *tarbzfile*.
    """
    spider = edp_spider.EDPSpider()
    response = fake_response_from_string(text=tarbzfile)
    # Use the builtin next() rather than the Python 2-only ``.next()``
    # method so the test also runs on Python 3.
    request = next(spider.handle_package_ftp(response))

    assert isinstance(request, Request)
    assert request.meta["source_folder"] == tarbzfile
Beispiel #16
0
def test_handle_package_ftp(tarbzfile):
    """Check the request yielded by ``handle_package_ftp``.

    The first object yielded for a response whose text is *tarbzfile* must
    be a ``Request`` with ``meta["source_folder"]`` equal to *tarbzfile*.
    """
    edp = edp_spider.EDPSpider()
    fake_response = fake_response_from_string(text=tarbzfile)
    yielded = edp.handle_package_ftp(fake_response)
    first_request = next(yielded)

    assert isinstance(first_request, Request)
    assert first_request.meta["source_folder"] == tarbzfile
Beispiel #17
0
def test_no_valid_article():
    """Check that an article of type ``lecture`` is dropped.

    ``parse_node`` must return ``None`` for the empty ``lecture`` article.
    """
    jats_xml = """
    <article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="lecture" xml:lang="en" dtd-version="3.0">
    </article>
    """
    edp = edp_spider.EDPSpider()
    fake_response = fake_response_from_string(jats_xml)
    article = get_node(edp, "//article", fake_response)[0]
    result = edp.parse_node(fake_response, article)

    assert result is None
Beispiel #18
0
def test_no_valid_article():
    """Check that an article of type ``lecture`` is dropped.

    ``parse_node`` must return ``None`` for the empty ``lecture`` article.
    """
    jats_xml = """
    <article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="lecture" xml:lang="en" dtd-version="3.0">
    </article>
    """
    edp = edp_spider.EDPSpider()
    fake_response = fake_response_from_string(jats_xml)
    article = get_node(edp, "//article", fake_response)[0]
    result = edp.parse_node(fake_response, article)

    assert result is None
Beispiel #19
0
def non_thesis():
    """Run the Phenix spider on a list item describing an M.Sc. thesis.

    Returns:
        Whatever ``parse_node`` returns for the ``<li>`` selection. The
        original author expected ``None`` here, since Master's theses are
        not wanted.
    """
    listing_html = """
    <ul>
    <li><b>M.Sc. Author</b>:
    "This is an Master's thesis, not a PhD", &nbsp; M.Sc. thesis at Master Science University, 2016,&nbsp;
    <br><br>
    </ul>
    """
    phenix = phenix_spider.PhenixSpider()
    fake_response = fake_response_from_string(listing_html)
    list_items = get_node(phenix, '//li', text=listing_html)
    result = phenix.parse_node(fake_response, list_items)
    return result
Beispiel #20
0
def no_year_no_author():
    """Build a fake response for a thesis page whose notes line has no year.

    Returns:
        The object created by ``fake_response_from_string`` for the page.
    """
    # NOTE(review): this spider instance is created but never used in this
    # block; it is kept so behaviour stays identical.
    brown = brown_spider.BrownSpider()
    page_html = """
    <html>
        <div class="panel-body">
            <dl class="">
                <dt>Notes</dt>
                <dd>Thesis (Ph.D. -- Brown University</dd>
            </dl>
        </div>
    </html>

    """
    fake_response = fake_response_from_string(page_html)
    return fake_response
Beispiel #21
0
def non_thesis():
    """Run the MIT spider's ``build_item`` on a page with degree ``M.Sc.``.

    Returns:
        Whatever ``build_item`` returns for the page. The original author
        expected ``None`` here, since Master's theses are not wanted.
    """
    page_html = """
    <html>
        <body>
            <tr class="ds-table-row odd ">
                <td class="label-cell">dc.description.degree</td>
                <td>M.Sc.</td>
                <td>en_US</td>
            </tr>
        </body>
    </html>
    """
    mit = mit_spider.MITSpider()
    fake_response = fake_response_from_string(page_html)
    result = mit.build_item(fake_response)
    return result
Beispiel #22
0
def test_collections_review():
    """Check the collections assigned to a JATS ``review-article``.

    The parsed record must list exactly the ``HEP`` and ``Review`` primary
    collections, in that order.
    """
    jats_xml = """
    <article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="review-article" xml:lang="en" dtd-version="3.0">
    </article>
    """
    edp = edp_spider.EDPSpider()
    fake_response = fake_response_from_string(jats_xml)
    article = get_node(edp, "//article", fake_response)[0]

    item = edp.parse_node(fake_response, article)
    assert item
    assert item.record
    hep_record = item.record

    expected_collections = [{'primary': 'HEP'}, {'primary': 'Review'}]
    assert "collections" in hep_record
    assert hep_record["collections"] == expected_collections
Beispiel #23
0
def test_collections_review():
    """Check the collections assigned to a JATS ``review-article``.

    The parsed record must list exactly the ``HEP`` and ``Review`` primary
    collections, in that order.
    """
    jats_xml = """
    <article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="review-article" xml:lang="en" dtd-version="3.0">
    </article>
    """
    edp = edp_spider.EDPSpider()
    fake_response = fake_response_from_string(jats_xml)
    article = get_node(edp, "//article", fake_response)[0]

    item = edp.parse_node(fake_response, article)
    assert item
    assert item.record
    hep_record = item.record

    expected_collections = [{'primary': 'HEP'}, {'primary': 'Review'}]
    assert "collections" in hep_record
    assert hep_record["collections"] == expected_collections
Beispiel #24
0
def test_non_thesis():
    """Check that a splash page of type ``Magister`` is skipped.

    ``scrape_splash`` must return ``None`` for such a page, because
    Master's theses are not wanted.
    """
    splash_html = """
    <html>
    <body>
    <tr valign="top">
      <td align="left" class="intest"> Tipo</td>
      <td align="left" class="bordo">Magister</td>
    </tr>
    </body>
    </html>
    """
    infn = infn_spider.InfnSpider()
    fake_response = fake_response_from_string(splash_html)
    result = infn.scrape_splash(fake_response)

    assert result is None
Beispiel #25
0
def test_no_dois_rich():
    """Check a 'rich' format article without a DOI yields no ``dois`` key.

    ``response.meta["rich"]`` is set to ``True`` before parsing; the parsed
    record must not contain ``dois`` and must be a ``HEPRecord``.
    """
    rich_xml = """
    <EDPSArticle>
        <ArticleID Type="Article">
            <EDPSRef>ds1691</EDPSRef>
        </ArticleID>
    </EDPSArticle>
    """
    edp = edp_spider.EDPSpider()
    fake_response = fake_response_from_string(rich_xml)
    fake_response.meta["rich"] = True
    article = get_node(edp, "//EDPSArticle", fake_response)[0]

    item = edp.parse_node(fake_response, article)
    assert item
    assert item.record
    hep_record = item.record

    assert "dois" not in hep_record
    assert isinstance(hep_record, HEPRecord)
Beispiel #26
0
def test_no_dois_rich():
    """Check a 'rich' format article without a DOI yields no ``dois`` key.

    ``response.meta["rich"]`` is set to ``True`` before parsing; the parsed
    record must not contain ``dois`` and must be a ``HEPRecord``.
    """
    rich_xml = """
    <EDPSArticle>
        <ArticleID Type="Article">
            <EDPSRef>ds1691</EDPSRef>
        </ArticleID>
    </EDPSArticle>
    """
    edp = edp_spider.EDPSpider()
    fake_response = fake_response_from_string(rich_xml)
    fake_response.meta["rich"] = True
    article = get_node(edp, "//EDPSArticle", fake_response)[0]

    item = edp.parse_node(fake_response, article)
    assert item
    assert item.record
    hep_record = item.record

    assert "dois" not in hep_record
    assert isinstance(hep_record, HEPRecord)
Beispiel #27
0
def test_no_aff():
    """Check ``scrape_for_pdf`` on a page without an author affiliation.

    The first item yielded must carry a ``HEPRecord`` that has no ``date``
    key and no ``affiliations`` in its ``authors`` value.
    """
    spider = magic_spider.MagicSpider()
    body = """
    <html>
    <div id="content">
        <h3 class="pub_title">Limits to the violation of Lorentz...</h3>
        <p class="author">Daniel Garrido Terrats</p>
    </div>
    </html>
    """
    response = fake_response_from_string(body)
    # Use the builtin next() rather than the Python 2-only ``.next()``
    # method so the test also runs on Python 3.
    parsed_item = next(spider.scrape_for_pdf(response))
    assert parsed_item
    assert parsed_item.record
    record = parsed_item.record

    assert isinstance(record, hepcrawl.items.HEPRecord)
    assert "date" not in record
    # NOTE(review): other tests in this file index record['authors'][0]; if
    # "authors" is a list here too, this membership check compares a string
    # against the list elements and may pass trivially — confirm whether
    # record["authors"][0] was intended.
    assert "affiliations" not in record["authors"]
Beispiel #28
0
def supervisors():
    """Build a record from an MIT page listing two advisors in one cell.

    The ``dc.contributor.advisor`` cell contains
    ``Seth Lloyd and J.D. Joannopoulos``.

    Returns:
        The ``record`` attribute of the item produced by ``build_item``.
    """
    page_html = """
    <html>
        <body>
            <tr class="ds-table-row odd ">
                <td class="label-cell">dc.contributor.advisor</td>
                <td>Seth Lloyd and J.D. Joannopoulos</td>
                <td>en_US</td>
            </tr>
        <body>
    <html>
    """
    mit = mit_spider.MITSpider()
    fake_response = fake_response_from_string(page_html)

    item = mit.build_item(fake_response)
    assert item
    assert item.record

    return item.record
Beispiel #29
0
def test_aff_with_email():
    """Check that an e-mail is stripped from a JATS affiliation string.

    The affiliation's ``<addr-line>`` ends with an ``e-mail:`` part. The
    first author's first affiliation value must equal the address without
    that part, and the author's ``email`` must be ``None``.
    """
    jats_xml = """
    <article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" xml:lang="en" dtd-version="3.0">
        <front>
            <article-meta>
            <contrib-group>
                <contrib contrib-type="author">
                    <name>
                        <surname>Gorczyca</surname>
                        <given-names>T. W.</given-names>
                    </name>
                    <xref ref-type="aff" rid="AFF1">1</xref>
                </contrib>
                <aff id="AFF1">
                    <label>1</label>
                    <addr-line>Department of Physics, Western Michigan University, Kalamazoo, MI 49008, USA e-mail: [email protected]
                    </addr-line>
                </aff>
            <contrib-group>
            </article-meta>
        </front>
    </article>
    """
    edp = edp_spider.EDPSpider()
    fake_response = fake_response_from_string(jats_xml)
    article = get_node(edp, "//article", fake_response)[0]

    item = edp.parse_node(fake_response, article)
    assert item
    assert item.record
    hep_record = item.record

    expected_affiliation = "Department of Physics, Western Michigan University, Kalamazoo, MI 49008, USA"
    assert 'affiliations' in hep_record['authors'][0]
    assert hep_record['authors'][0]['affiliations'][0]['value'] == expected_affiliation
    assert "e-mail" not in hep_record['authors'][0]['affiliations'][0]['value']
    assert hep_record['authors'][0]['email'] is None
Beispiel #30
0
def test_aff_with_email():
    """Check that an e-mail is stripped from a JATS affiliation string.

    The affiliation's ``<addr-line>`` ends with an ``e-mail:`` part. The
    first author's first affiliation value must equal the address without
    that part, and the author's ``email`` must be ``None``.
    """
    jats_xml = """
    <article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="research-article" xml:lang="en" dtd-version="3.0">
        <front>
            <article-meta>
            <contrib-group>
                <contrib contrib-type="author">
                    <name>
                        <surname>Gorczyca</surname>
                        <given-names>T. W.</given-names>
                    </name>
                    <xref ref-type="aff" rid="AFF1">1</xref>
                </contrib>
                <aff id="AFF1">
                    <label>1</label>
                    <addr-line>Department of Physics, Western Michigan University, Kalamazoo, MI 49008, USA e-mail: [email protected]
                    </addr-line>
                </aff>
            <contrib-group>
            </article-meta>
        </front>
    </article>
    """
    edp = edp_spider.EDPSpider()
    fake_response = fake_response_from_string(jats_xml)
    article = get_node(edp, "//article", fake_response)[0]

    item = edp.parse_node(fake_response, article)
    assert item
    assert item.record
    hep_record = item.record

    expected_affiliation = "Department of Physics, Western Michigan University, Kalamazoo, MI 49008, USA"
    assert 'affiliations' in hep_record['authors'][0]
    assert hep_record['authors'][0]['affiliations'][0]['value'] == expected_affiliation
    assert "e-mail" not in hep_record['authors'][0]['affiliations'][0]['value']
    assert hep_record['authors'][0]['email'] is None
Beispiel #31
0
def test_not_published_record():
    """Check that an ``aheadofprint`` article produces no record.

    ``parse_node`` must return ``None`` for an article whose ``PubDate`` has
    ``PubStatus="aheadofprint"``.
    """
    article_set_xml = """
    <ArticleSet>
        <Article>
            <Journal>
                <PubDate PubStatus="aheadofprint">
                    <Year>2015</Year>
                    <Month>03</Month>
                </PubDate>
            </Journal>
        </Article>
    </ArticleSet>
    """
    iop = iop_spider.IOPSpider()
    fake_response = fake_response_from_string(article_set_xml)
    article = get_node(iop, "Article", fake_response)
    pdf_dir = get_test_suite_path('responses', 'iop', 'pdf')
    iop.pdf_files = pdf_dir
    result = iop.parse_node(fake_response, article)
    assert result is None
Beispiel #32
0
def test_author_with_email():
    """Check that the author's e-mail is read from a JATS ``<email>`` tag.

    The first author of the parsed record must have an ``email`` key whose
    value equals the expected address.
    """
    # NOTE(review): the address inside <email> and the expected value below
    # both look like redaction placeholders and are not the same string —
    # confirm against the real fixture data.
    jats_xml = """
    <article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="addendum" xml:lang="en" dtd-version="3.0">
        <front>
            <article-meta>
            <contrib-group content-type="authors">
            <contrib contrib-type="author" corresp="yes"><name><surname>Sname</surname><given-names>Fname</given-names></name><email>[email protected]</email><xref ref-type="aff" rid="AFF1"/><xref ref-type="corresp" rid="FN1">a</xref></contrib>
            </contrib-group>
            </article-meta>
        </front>
    </article>
    """
    edp = edp_spider.EDPSpider()
    fake_response = fake_response_from_string(jats_xml)
    article = get_node(edp, "//article", fake_response)[0]

    item = edp.parse_node(fake_response, article)
    assert item
    assert item.record
    hep_record = item.record

    expected_email = "*****@*****.**"
    assert 'email' in hep_record['authors'][0]
    assert hep_record['authors'][0]['email'] == expected_email
Beispiel #33
0
def test_not_published_record():
    """Check that an ``aheadofprint`` article produces no record.

    ``parse_node`` must return ``None`` for an article whose ``PubDate`` has
    ``PubStatus="aheadofprint"``.
    """
    article_set_xml = """
    <ArticleSet>
        <Article>
            <Journal>
                <PubDate PubStatus="aheadofprint">
                    <Year>2015</Year>
                    <Month>03</Month>
                </PubDate>
            </Journal>
        </Article>
    </ArticleSet>
    """
    iop = iop_spider.IOPSpider()
    fake_response = fake_response_from_string(article_set_xml)
    article = get_node(iop, "Article", fake_response)
    pdf_dir = get_test_suite_path('responses', 'iop', 'pdf')
    iop.pdf_files = pdf_dir
    result = iop.parse_node(fake_response, article)
    assert result is None
Beispiel #34
0
def test_author_with_email():
    """Check that the author's e-mail is read from a JATS ``<email>`` tag.

    The first author of the parsed record must have an ``email`` key whose
    value equals the expected address.
    """
    # NOTE(review): the address inside <email> and the expected value below
    # both look like redaction placeholders and are not the same string —
    # confirm against the real fixture data.
    jats_xml = """
    <article xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" article-type="addendum" xml:lang="en" dtd-version="3.0">
        <front>
            <article-meta>
            <contrib-group content-type="authors">
            <contrib contrib-type="author" corresp="yes"><name><surname>Sname</surname><given-names>Fname</given-names></name><email>[email protected]</email><xref ref-type="aff" rid="AFF1"/><xref ref-type="corresp" rid="FN1">a</xref></contrib>
            </contrib-group>
            </article-meta>
        </front>
    </article>
    """
    edp = edp_spider.EDPSpider()
    fake_response = fake_response_from_string(jats_xml)
    article = get_node(edp, "//article", fake_response)[0]

    item = edp.parse_node(fake_response, article)
    assert item
    assert item.record
    hep_record = item.record

    expected_email = "*****@*****.**"
    assert 'email' in hep_record['authors'][0]
    assert hep_record['authors'][0]['email'] == expected_email
Beispiel #35
0
def package_rich(tarbzfile):
    """Return the first item yielded by ``handle_package_file``.

    The spider is given an empty-bodied fake response whose url is
    ``file://`` followed by *tarbzfile*.
    """
    package_url = "file://" + tarbzfile
    edp = edp_spider.EDPSpider()
    fake_response = fake_response_from_string(text="", url=package_url)
    yielded = edp.handle_package_file(fake_response)
    return next(yielded)
Beispiel #36
0
def package_rich(tarbzfile):
    """Return the first item yielded by ``handle_package_file``.

    The spider is given an empty-bodied fake response whose url is
    ``file://`` followed by *tarbzfile*.
    """
    spider = edp_spider.EDPSpider()
    response = fake_response_from_string(text="", url="file://" + tarbzfile)
    # Use the builtin next() rather than the Python 2-only ``.next()``
    # method so the fixture also works on Python 3.
    return next(spider.handle_package_file(response))
Beispiel #37
0
def package_jats(targzfile):
    """Return the first item yielded by ``handle_package_file``.

    The spider is given an empty-bodied fake response whose url is
    ``file://`` followed by *targzfile*.
    """
    spider = edp_spider.EDPSpider()
    response = fake_response_from_string(text="", url="file://" + targzfile)
    # Use the builtin next() rather than the Python 2-only ``.next()``
    # method so the fixture also works on Python 3.
    return next(spider.handle_package_file(response))