Beispiel #1
0
def ingest():
    """
    Load institution records.

    Calls `Institution.ingest_usa()` followed by
    `Institution.ingest_world()`, in that order.
    """
    Institution.ingest_usa()
    Institution.ingest_world()
def ingest():
    """
    Populate the institution table.

    Runs the two ingest entry points exposed by `Institution`:
    `ingest_usa()` first, then `ingest_world()`.
    """
    model = Institution

    model.ingest_usa()
    model.ingest_world()
def test_block_empty_urls():
    """
    Rows with an empty URL must not be ingested.
    """
    package = 'osp.test.institutions.models.institution'
    fixture = 'fixtures/ingest_world/empty_urls.csv'

    Institution.ingest_world(package, fixture)

    # Exactly one institution row should exist after the ingest ...
    assert Institution.select().count() == 1

    # ... and it should be the one named "inst1".
    inst1 = Institution.select().where(Institution.name == 'inst1')
    assert inst1
def test_strip_values():
    """
    Ingested field values should come out stripped.
    """
    package = 'osp.test.institutions.models.institution'
    fixture = 'fixtures/ingest_world/strip_values.csv'

    Institution.ingest_world(package, fixture)

    # A row carrying these exact values has to be present.
    # `== None` is deliberate: it is part of the query expression,
    # not a Python identity check.
    stripped = Institution.select().where(
        Institution.name == 'inst',
        Institution.url == 'http://inst.edu',
        Institution.state == None,
        Institution.country == 'AU',
    )

    assert stripped
    def institution(self):
        """
        Look up the institution linked to this row's document.

        Selects institutions joined through Institution_Document to
        Document, filtered on `Document.id == self.document`.

        Returns:
            Institution: The first row of that query, if any.
        """
        linked = (
            Institution
            .select()
            .join(Institution_Document)
            .join(Document)
            .where(Document.id == self.document)
        )

        return linked.first()
def test_strip_values():
    """
    Whitespace around ingested values should be removed, including on
    the `domain` field.
    """
    Institution.ingest_world(
        'osp.test.institutions.models.institution',
        'fixtures/ingest_world/strip_values.csv',
    )

    # Every expected column value, expressed as one query condition each.
    # `== None` is deliberate: it is part of the query expression.
    expected = (
        Institution.name == 'inst',
        Institution.url == 'http://inst.edu',
        Institution.domain == 'inst.edu',
        Institution.state == None,
        Institution.country == 'AU',
    )

    assert Institution.select().where(*expected)
def test_insert_rows():
    """
    Institution.ingest_world() should insert the fixture rows.
    """
    package = 'osp.test.institutions.models.institution'
    fixture = 'fixtures/ingest_world/insert_rows.csv'

    Institution.ingest_world(package, fixture)

    assert Institution.select().count() == 3

    # One row per suffix, with name / url / country derived from it.
    for suffix in ('1', '2', '3'):

        name = 'inst{0}'.format(suffix)
        url = 'http://inst{0}.edu'.format(suffix)
        country = 'C{0}'.format(suffix)

        row = Institution.select().where(
            Institution.name == name,
            Institution.url == url,
            Institution.state == None,
            Institution.country == country,
        )

        assert row
    def link(cls):
        """
        Link documents -> institutions.

        First builds a map of domain -> [(regex, institution), ...] from
        the URL of every institution. Then, for each document, the
        syllabus URL is searched with the patterns registered under its
        domain, and a single link row is created for the institution whose
        pattern produced the longest match. Documents without any match
        are left unlinked.

        An exception raised while handling one document is printed and
        that document is skipped, so the loop continues with the next one.
        """

        domain_to_inst = defaultdict(list)

        # Map domain -> [(regex, inst), ...]
        for inst in ServerSide(Institution.select()):

            domain = parse_domain(inst.url)

            regex = seed_to_regex(inst.url)

            domain_to_inst[domain].append((regex, inst))

        for doc in query_bar(Document.select()):

            try:

                # TODO: Get rid of @property.
                url = doc.syllabus.url

                domain = parse_domain(url)

                # Use .get() instead of indexing: indexing a defaultdict
                # inserts an empty list for every domain that has no
                # institution, so the map would keep growing during the
                # whole pass over the documents.
                candidates = domain_to_inst.get(domain, ())

                # Find institutions with matching URLs.
                matches = []
                for pattern, inst in candidates:

                    match = pattern.search(url)

                    if match:
                        matches.append((match.group(), inst))

                if matches:

                    # Link to the institution with the longest match.
                    # max() returns the first of several equally long
                    # matches, which is the same element a stable
                    # descending sort would place first.
                    _, best = max(matches, key=lambda m: len(m[0]))

                    cls.create(
                        institution=best,
                        document=doc,
                    )

            except Exception as e:
                # Best effort: report the problem and move on.
                print(e)
def test_insert_rows():
    """
    Institution.ingest_world() should load every fixture row, with the
    `domain` column filled in as well.
    """
    Institution.ingest_world(
        'osp.test.institutions.models.institution',
        'fixtures/ingest_world/insert_rows.csv',
    )

    assert Institution.select().count() == 3

    for number in range(1, 4):

        # Same text the fixture rows are expected to carry.
        conditions = (
            Institution.name == 'inst{0}'.format(number),
            Institution.url == 'http://inst{0}.edu'.format(number),
            Institution.domain == 'inst{0}.edu'.format(number),
            Institution.state == None,
            Institution.country == 'C{0}'.format(number),
        )

        assert Institution.select().where(*conditions)
    def es_stream_docs(cls):
        """
        Stream institutions as index documents.

        Iterates `Institution.select()` wrapped in `query_bar` and emits
        one dict per row.

        Yields:
            dict: `_id` (the row id) and `name` of the next institution.
        """
        for inst in query_bar(Institution.select()):
            yield {'_id': inst.id, 'name': inst.name}
    def es_stream_docs(cls):
        """
        Generate the documents to index, one per institution.

        Yields:
            dict: The next document, holding the row's id under `_id`
            and its name under `name`.
        """
        institutions = query_bar(Institution.select())

        for institution in institutions:

            document = dict(_id=institution.id, name=institution.name)

            yield document
Beispiel #12
0
def test_es_insert(es, add_institution):
    """
    Institution_Index.es_insert() should index institutions.
    """
    # Create ten institutions named inst0 .. inst9.
    for number in range(10):
        add_institution('inst' + str(number))

    Institution_Index.es_insert()

    # Each institution must be retrievable from the 'institution' index
    # by its id, with the same name.
    for institution in Institution.select():

        hit = config.es.get(index='institution', id=institution.id)
        indexed_name = hit['_source']['name']

        assert indexed_name == institution.name
def test_es_insert(es, add_institution):
    """
    After Institution_Index.es_insert(), every institution row should be
    found in the index under its own id.
    """
    names = ['inst' + str(i) for i in range(10)]

    for name in names:
        add_institution(name)

    Institution_Index.es_insert()

    for row in Institution.select():

        # Look the row up in the 'institution' index by primary key.
        result = config.es.get(
            index='institution',
            id=row.id,
        )

        source = result['_source']

        assert source['name'] == row.name
def test_skip_us_rows():
    """
    "US" institutions should be ignored.
    """
    package = 'osp.test.institutions.models.institution'
    fixture = 'fixtures/ingest_world/skip_us_rows.csv'

    Institution.ingest_world(package, fixture)

    assert Institution.select().count() == 2

    # These two rows must have been loaded ...
    for kept in ('inst1', 'inst2'):
        assert Institution.select().where(Institution.name == kept)

    # ... while inst3 must be absent (presumably the "US" row of the
    # fixture -- confirm against the CSV).
    assert not Institution.select().where(Institution.name == 'inst3')
    def _inst(
        name='Yale University',
        url=None,
        domain=None,
        state='CA',
        country='US',
    ):
        """
        Create an institution row.

        A falsy `url` or `domain` is replaced with a fresh `uuid.uuid4()`
        value before the row is created.

        Returns:
            The result of `Institution.create(...)`.
        """
        url = url or uuid.uuid4()
        domain = domain or uuid.uuid4()

        return Institution.create(
            name=name,
            url=url,
            domain=domain,
            state=state,
            country=country,
        )
    def _inst(
        name='Yale University',
        url=None,
        domain=None,
        state='CA',
        country='US',
    ):
        """
        Insert an institution, generating placeholder values as needed.

        When `url` or `domain` is missing (falsy), a new uuid4 is used in
        its place. All other arguments are passed through unchanged.

        Returns:
            Whatever `Institution.create(...)` returns for the new row.
        """
        # Evaluated in order, so the url placeholder is generated before
        # the domain placeholder.
        fields = dict(
            name=name,
            url=url if url else uuid.uuid4(),
            domain=domain if domain else uuid.uuid4(),
            state=state,
            country=country,
        )

        return Institution.create(**fields)
def test_skip_us_rows():
    """
    Rows for "US" institutions should be left out by ingest_world().
    """
    def named(name):
        # Query for institutions carrying the given name.
        return Institution.select().where(Institution.name == name)

    Institution.ingest_world(
        'osp.test.institutions.models.institution',
        'fixtures/ingest_world/skip_us_rows.csv',
    )

    # Two rows are expected to be loaded: inst1 and inst2.
    assert Institution.select().count() == 2

    assert named('inst1')
    assert named('inst2')

    # inst3 must not have been loaded.
    assert not named('inst3')