def test_url_pipeline():
    """Push a URL-sourced document through a parse-and-store pipeline.

    A document is created from a URL, run through a text parser step and a
    document-store writer step, and the run statistics plus the store
    contents are then checked.
    """
    source_url = "http://www.google.com"
    url_document = Document.from_url(source_url)
    store = LocalDocumentStore()

    # Assemble the pipeline one step at a time; add_step returns the
    # pipeline to continue with, so each result is re-bound.
    pipeline = Pipeline(url_document)
    pipeline = pipeline.add_step(TextParser(encoding='ISO-8859-1'))
    pipeline = pipeline.add_step(DocumentStoreWriter(store))
    run_stats = pipeline.run().statistics

    # Exactly one document went through, without errors, and was stored.
    assert run_stats.documents_processed == 1
    assert run_stats.document_exceptions == 0
    assert store.count() == 1

    stored_document = store.get_latest_document(source_url)
    print(stored_document.content_node.get_all_content())
Exemple #2
0
    def from_url(url, headers=None, *args, **kwargs):
        """Create a pipeline whose input is a document built from a URL.

        Args:
          url: The URL to build the document from, e.g. https://www.google.com
          headers: A dictionary of headers handed to ``Document.from_url``
            (Default value = None)
          *args: Extra positional arguments forwarded to ``Pipeline``
          **kwargs: Extra keyword arguments forwarded to ``Pipeline``

        Returns:
          A new instance of a pipeline

        """
        url_document = Document.from_url(url, headers)
        return Pipeline(url_document, *args, **kwargs)
Exemple #3
0
def test_get_source():
    """Open the source of a URL-backed document and print what is read."""
    url_document = Document.from_url('https://www.google.com')

    # get_source is used as a context manager yielding a readable handle.
    with get_source(url_document) as source_handle:
        print(source_handle.read())