def test_block_empty_urls():
    """
    Rows with empty URLs must not be ingested.

    After loading the `empty_urls.csv` fixture, exactly one institution
    row should exist and a lookup for 'inst1' should be non-empty.
    """
    Institution.ingest_world(
        'osp.test.institutions.models.institution',
        'fixtures/ingest_world/empty_urls.csv',
    )

    # Only a single row survives the ingest.
    assert Institution.select().count() == 1

    # The query object itself is asserted, as in the rest of this suite.
    inst1 = Institution.select().where(Institution.name == 'inst1')
    assert inst1
def test_skip_us_rows():
    """
    Rows for "US" institutions must be left out of the ingest.
    """
    Institution.ingest_world(
        'osp.test.institutions.models.institution',
        'fixtures/ingest_world/skip_us_rows.csv',
    )

    def named(name):
        # Query for institutions carrying the given name.
        return Institution.select().where(Institution.name == name)

    assert Institution.select().count() == 2

    # inst1 and inst2 are loaded; inst3 is not.
    assert named('inst1')
    assert named('inst2')
    assert not named('inst3')
def test_skip_us_rows():
    """
    "US" institutions should be skipped by Institution.ingest_world().
    """
    Institution.ingest_world(
        'osp.test.institutions.models.institution',
        'fixtures/ingest_world/skip_us_rows.csv',
    )

    assert Institution.select().count() == 2

    # These two names must be present after the ingest ...
    for kept in ('inst1', 'inst2'):
        assert Institution.select().where(Institution.name == kept)

    # ... and this one must be absent.
    assert not Institution.select().where(Institution.name == 'inst3')
def institution(self):
    """
    Get the institution linked to this row's document, if any.

    Joins Institution -> Institution_Document -> Document and filters on
    the document referenced by `self.document`.

    Returns:
        Institution: The first row produced by the query.
    """
    linked = (
        Institution
        .select()
        .join(Institution_Document)
        .join(Document)
        .where(Document.id == self.document)
    )

    return linked.first()
def test_insert_rows():
    """
    Institution.ingest_world() should insert one row per fixture line.
    """
    Institution.ingest_world(
        'osp.test.institutions.models.institution',
        'fixtures/ingest_world/insert_rows.csv',
    )

    assert Institution.select().count() == 3

    for n in range(1, 4):
        expected_name = 'inst{0}'.format(n)
        expected_url = 'http://inst{0}.edu'.format(n)
        expected_country = 'C{0}'.format(n)

        # `== None` (not `is None`) is deliberate: the comparison is part
        # of the query expression handed to where().
        assert Institution.select().where(
            Institution.name == expected_name,
            Institution.url == expected_url,
            Institution.state == None,
            Institution.country == expected_country,
        )
def link(cls):
    """
    Link documents -> institutions.

    First builds an in-memory table that maps each institution's domain to
    the (pattern, institution) pairs derived from its URL. Then, for every
    document, the syllabus URL is tested against the patterns registered
    for its domain. When at least one pattern matches, a row is created
    linking the document to the institution whose pattern matched the
    longest substring (the first such institution wins on ties).

    Errors raised while processing an individual document are printed and
    the document is skipped, so one bad row does not abort the whole run.
    """
    # Map domain -> [(regex, inst), ...]
    domain_to_inst = defaultdict(list)

    for inst in ServerSide(Institution.select()):
        domain = parse_domain(inst.url)
        regex = seed_to_regex(inst.url)
        domain_to_inst[domain].append((regex, inst))

    for doc in query_bar(Document.select()):
        try:
            # TODO: Get rid of @property.
            url = doc.syllabus.url
            domain = parse_domain(url)

            # Use .get() instead of indexing: indexing a defaultdict would
            # insert an empty list for every document domain that has no
            # institution, growing the table for the length of the run.
            candidates = domain_to_inst.get(domain, ())

            # Find institutions with matching URLs.
            matches = []
            for pattern, inst in candidates:
                match = pattern.search(url)
                if match:
                    matches.append((match.group(), inst))

            if matches:
                # Link to the institution with the longest match. max()
                # returns the first maximal item, which is the same element
                # a stable descending sort would place at the head.
                _, best = max(matches, key=lambda m: len(m[0]))

                cls.create(
                    institution=best,
                    document=doc,
                )

        except Exception as e:
            # Deliberate best-effort: report the failure and move on to
            # the next document.
            print(e)
def test_insert_rows():
    """
    Institution.ingest_world() should load rows, including the domain.
    """
    Institution.ingest_world(
        'osp.test.institutions.models.institution',
        'fixtures/ingest_world/insert_rows.csv',
    )

    assert Institution.select().count() == 3

    for suffix in ('1', '2', '3'):
        # `== None` is intentional here: it is part of the query
        # expression passed to where(), not a plain Python comparison.
        row = Institution.select().where(
            Institution.name == 'inst{0}'.format(suffix),
            Institution.url == 'http://inst{0}.edu'.format(suffix),
            Institution.domain == 'inst{0}.edu'.format(suffix),
            Institution.state == None,
            Institution.country == 'C{0}'.format(suffix),
        )

        assert row
def es_stream_docs(cls):
    """
    Stream one index document per institution.

    Yields:
        dict: The next document, carrying the row's id (as `_id`) and name.
    """
    for inst in query_bar(Institution.select()):
        yield {
            '_id': inst.id,
            'name': inst.name,
        }
def es_stream_docs(cls):
    """
    Index institutions.

    Walks every Institution row and emits a document built from it.

    Yields:
        dict: The next document, with `_id` and `name` keys.
    """
    institutions = query_bar(Institution.select())

    for inst in institutions:
        doc = dict(_id=inst.id, name=inst.name)
        yield doc
def test_strip_values():
    """
    Field values should be stripped on ingest.
    """
    Institution.ingest_world(
        'osp.test.institutions.models.institution',
        'fixtures/ingest_world/strip_values.csv',
    )

    # `== None` is deliberate: it is part of the query expression given to
    # where(), so it must not be rewritten as `is None`.
    stripped = Institution.select().where(
        Institution.name == 'inst',
        Institution.url == 'http://inst.edu',
        Institution.state == None,
        Institution.country == 'AU',
    )

    assert stripped
def test_es_insert(es, add_institution):
    """
    Institution_Index.es_insert() should index every institution.
    """
    for n in range(10):
        add_institution('inst' + str(n))

    Institution_Index.es_insert()

    # Each institution must be retrievable by id with its name intact.
    for inst in Institution.select():
        indexed = config.es.get(index='institution', id=inst.id)
        assert indexed['_source']['name'] == inst.name
def test_strip_values():
    """
    Ingested field values, including the domain, should be stripped.
    """
    Institution.ingest_world(
        'osp.test.institutions.models.institution',
        'fixtures/ingest_world/strip_values.csv',
    )

    # Conditions are collected first and then applied in one where() call.
    # `== None` is intentional: it builds a query expression.
    conditions = (
        Institution.name == 'inst',
        Institution.url == 'http://inst.edu',
        Institution.domain == 'inst.edu',
        Institution.state == None,
        Institution.country == 'AU',
    )

    assert Institution.select().where(*conditions)
def test_es_insert(es, add_institution):
    """
    Institution_Index.es_insert() should index institutions.

    Ten institutions are registered, indexed, and then each one is fetched
    back from the 'institution' index by id to compare names.
    """
    names = ['inst' + str(i) for i in range(10)]
    for name in names:
        add_institution(name)

    Institution_Index.es_insert()

    for inst in Institution.select():
        hit = config.es.get(
            index='institution',
            id=inst.id,
        )
        source = hit['_source']
        assert source['name'] == inst.name