def test_import_no_owner(self):
    '''Import FB pages as Places with owner importing turned off.'''
    # square cafe: neither org nor place present in the fixture
    pid_without_org = '84714961156'
    # voluto coffee: org is in the fixture but the place is not
    pid_with_org = '50141015898'

    # snapshot the org-related tables so we can prove they aren't modified
    orgs_snapshot = list(Organization.objects.all())
    records_snapshot = list(FacebookOrgRecord.objects.all())

    manager = PageImportManager()

    # with import_owners off and no stored org, no owner may be attached
    report = manager.import_place(pid_without_org, import_owners=False)
    self.assertIsNone(report.model_instance.owner)

    # a pre-existing org must still be linked up, even without importing
    report = manager.import_place(pid_with_org, import_owners=False)
    self.assertIsNotNone(report.model_instance)
    stored_org = FacebookOrgRecord.objects.get(fb_id=pid_with_org).organization
    self.assertEquals(report.model_instance.owner, stored_org)

    # double check that the Organization and FacebookOrgRecord tables weren't touched
    self.assertEquals(orgs_snapshot, list(Organization.objects.all()))
    self.assertEquals(records_snapshot, list(FacebookOrgRecord.objects.all()))
def test_pulling(self):
    '''
    Tests internal FB page info gathering code -- not model importing.
    '''
    ids = ['84714961156',       # Square Cafe
           '9423481220941280']  # invalid fbid
    # pad with 100 random ids so the batch-request code gets a workout
    ids.extend(str(random.randint(1, 1e12)) for _ in range(100))

    infos = PageImportManager().pull_page_info(ids)
    self.assertEquals(len(infos), len(ids))

    # Little can be asserted about third-party pages; just confirm the
    # first result carries a couple of page-specific fields.
    first = infos[0]
    self.assertIn('name', first.keys())
    self.assertIn('location', first.keys())  # this is a Place page

    # the bogus id should surface as an API error, not cached data
    bogus = infos[1]
    self.assertTrue(isinstance(bogus, FacebookAPIError))
def import_pageinfos():
    '''Refresh the cached page-info JSON for every FacebookPage record
    that currently has none, pulling from Facebook in batches.

    Pages whose pull fails (non-dict response) are logged and skipped.
    '''
    all_page_ids = [
        str(page.fb_id)
        for page in FacebookPage.objects.filter(pageinfo_json='')
    ]
    page_mgr = PageImportManager()
    offset, step_size = 0, 2000
    importlog.info('Pulling %d pages from Facebook (step of %d)' %
                   (len(all_page_ids), step_size))
    while offset < len(all_page_ids):
        # BUG FIX: the bound was min(offset+step_size, len(all_page_ids)-1),
        # which silently dropped the final id. Python slicing already clamps
        # the end index, so a plain slice is both simpler and correct.
        page_ids = all_page_ids[offset:offset + step_size]
        infos = page_mgr.pull_page_info(page_ids)
        importlog.info('Importing %d page infos into database' % len(page_ids))
        for pid, info in zip(page_ids, infos):
            if not isinstance(info, dict):
                # error objects come back in place of dicts on failed pulls
                importlog.info('Bad response for page id %s' % str(pid))
                continue
            info.pop('metadata', None)  # no need to persist request metadata
            record = FacebookPage.objects.get(fb_id=pid)
            record.pageinfo_json = json.dumps(info)
            record.save()
        offset += step_size
def test_import(self):
    '''Tests the importing of a batch of FB pages as Places'''
    manager = PageImportManager()
    expectations = [
        ('84714961156', None),                                  # Square Cafe
        ('139288502700', TypeError),                            # Pgh Marathon (no location)
        ('291107654260858', TypeError),                         # event page
        ('9423481220941280', FacebookAPIError),                 # bogus id
        ('53379078585', PageImportReport.ModelInstanceExists),  # big dog
    ]
    random.shuffle(expectations)

    # remember the FB records of any pages that already exist
    preexisting = {}
    for fbid, notice in expectations:
        if notice is PageImportReport.ModelInstanceExists:
            preexisting[fbid] = ExternalPlaceSource.facebook.get(uid=fbid)

    fbids = [fbid for fbid, _ in expectations]

    # run insertion code
    manager.pull_page_info(fbids)  # cache pages
    reports = [manager.import_place(fbid) for fbid in fbids]
    self.assertEquals([r.page_id for r in reports], fbids,
                      'non-parallel list of PageImportReports returned')

    for (fbid, expected), report in zip(expectations, reports):
        if expected:
            # failure case: no model instance may be returned
            self.assertIsNone(report.model_instance)
            # exactly one notice, of the expected type
            self.assertEquals(len(report.notices), 1)
            self.assertTrue(
                isinstance(report.notices[0], expected),
                'Expecting notice %s from importing fb page %s' %
                (str(expected), fbid))
            if expected is PageImportReport.ModelInstanceExists:
                # the pre-existing record must be untouched
                self.assertEquals(preexisting[fbid],
                                  ExternalPlaceSource.facebook.get(uid=fbid))
            else:
                # otherwise no record may have been created at all
                with self.assertRaises(ExternalPlaceSource.DoesNotExist):
                    ExternalPlaceSource.facebook.get(uid=fbid)
        else:
            # success case: clean report, new instance matching the FB record
            self.assertEquals([], report.notices)
            try:
                place = ExternalPlaceSource.facebook.get(uid=fbid).place
            except ExternalPlaceSource.DoesNotExist:
                self.fail('No place record for fbid %s' % fbid)
            if place != report.model_instance:
                self.fail('No place created for fbid %s' % fbid)
def test_import(self): """Tests the importing of a batch of FB pages as Orgs""" mgr = PageImportManager() pid_notice_pairs = [ ("84714961156", None), # Square Cafe ("139288502700", None), # Pgh Marathon ("220439", TypeError), # user page ("291107654260858", TypeError), # event page ("9423481220941280", FacebookAPIError), # bogus id ("53379078585", PageImportReport.ModelInstanceExists), ] random.shuffle(pid_notice_pairs) # grab original FB records from any pages that already exist original_fb_records = {} for pid, notice in pid_notice_pairs: if notice is PageImportReport.ModelInstanceExists: original_fb_records[pid] = FacebookOrgRecord.objects.get(fb_id=pid) pids = [pair[0] for pair in pid_notice_pairs] # run insertion code mgr.pull_page_info(pids) # cache pages results = [mgr.import_org(pid) for pid in pids] self.assertEquals( [result.page_id for result in results], [pid for pid, _ in pid_notice_pairs], "non-parallel list of PageImportReports returned", ) for pair, result in zip(pid_notice_pairs, results): pid, expected_notice = pair if not expected_notice: self.assertEquals([], result.notices) # assert a new model instance was created and it's FB record matches what was returned try: org = FacebookOrgRecord.objects.get(fb_id=pid).organization except FacebookOrgRecord.DoesNotExist: self.fail("No organization record for fbid %s" % pid) if org != result.model_instance: self.fail("No organization created for fbid %s" % pid) else: # assert no model instance is returned self.assertIsNone(result.model_instance) # assert expected notice was generated self.assertEquals(len(result.notices), 1) self.assertTrue( isinstance(result.notices[0], expected_notice), "Expecting notice %s from importing fb page %s" % (str(expected_notice), pid), ) # if notice was a ModelInstanceExists, be sure the original record wasn't touched if expected_notice is PageImportReport.ModelInstanceExists: self.assertEquals(original_fb_records[pid], FacebookOrgRecord.objects.get(fb_id=pid)) # otherwise, 
make sure no record was created at all else: with self.assertRaises(FacebookOrgRecord.DoesNotExist): FacebookOrgRecord.objects.get(fb_id=pid)
def import_ids(page_ids):
    '''Import the given Facebook page ids as Organizations and Places.

    For each id the page info JSON is pulled and cached on its
    FacebookPage record, then an Organization import and a Place import
    are attempted. Outcomes are written to importlog.
    '''
    page_mgr = PageImportManager()
    importlog.info('Pulling %d pages from Facebook' % len(page_ids))
    importlog.info('Importing Organizations into database from %d pages' % len(page_ids))
    page_infos = page_mgr.pull_page_info(page_ids)  # cache pages

    importlog.info('Refresing cached FB page infos')
    for pid, info in zip(page_ids, page_infos):
        if not isinstance(info, dict):
            # failed pulls come back as error objects instead of dicts
            importlog.info('Cannot store page info JSON for fb id %s' % str(pid))
            continue
        info.pop('metadata', None)  # don't need to store metadata if it exists
        record, _ = FacebookPage.objects.get_or_create(fb_id=pid)
        record.pageinfo_json = json.dumps(info)
        record.save()

    def _log_report(report, label):
        '''Log the outcome of one import report; return 1 on a fresh import.'''
        if report.notices:
            for notice in report.notices:
                if isinstance(notice, PageImportReport.ModelInstanceExists):
                    importlog.info('%s: Record for %s exists' % (report.page_id, label))
                else:
                    importlog.error('%s: %s' % (report.page_id, unicode(notice)))
            return 0
        importlog.info('%s: Imported successfully as %s (id=%d)' %
                       (report.page_id,
                        unicode(report.model_instance),
                        report.model_instance.id))
        return 1

    import_count = sum(_log_report(page_mgr.import_org(pid), 'Organization')
                       for pid in page_ids)
    importlog.info('%d new Organizations imported' % import_count)

    # BUG FIX: removed the stray `page_mgr.import_place(page_ids, import_owners=True)`
    # call -- it passed the whole id list where a single id is expected and its
    # (generator) result was never consumed; the per-id loop below does the work.
    import_count = sum(_log_report(page_mgr.import_place(pid), 'Place')
                       for pid in page_ids)
    importlog.info('%d new Places imported' % import_count)
def import_pageinfos():
    '''Refresh the cached page-info JSON for every FacebookPage record
    that currently has none, pulling from Facebook in batches.

    Pages whose pull fails (non-dict response) are logged and skipped.
    '''
    all_page_ids = [str(page.fb_id)
                    for page in FacebookPage.objects.filter(pageinfo_json='')]
    page_mgr = PageImportManager()
    offset, step_size = 0, 2000
    importlog.info('Pulling %d pages from Facebook (step of %d)' %
                   (len(all_page_ids), step_size))
    while offset < len(all_page_ids):
        # BUG FIX: the bound was min(offset+step_size, len(all_page_ids)-1),
        # which silently dropped the final id. Python slicing already clamps
        # the end index, so a plain slice is both simpler and correct.
        page_ids = all_page_ids[offset:offset + step_size]
        infos = page_mgr.pull_page_info(page_ids)
        importlog.info('Importing %d page infos into database' % len(page_ids))
        for pid, info in zip(page_ids, infos):
            if not isinstance(info, dict):
                # error objects come back in place of dicts on failed pulls
                importlog.info('Bad response for page id %s' % str(pid))
                continue
            info.pop('metadata', None)  # no need to persist request metadata
            record = FacebookPage.objects.get(fb_id=pid)
            record.pageinfo_json = json.dumps(info)
            record.save()
        offset += step_size
def test_import_no_owner(self):
    """Import FB pages as Places while owner importing is disabled."""
    # square cafe: neither org nor place present in the fixture
    unowned_pid = "84714961156"
    # voluto coffee: org is in the fixture but the place is not
    owned_pid = "50141015898"

    # snapshot the org-related tables up front so we can verify no writes
    orgs_before = list(Organization.objects.all())
    records_before = list(FacebookOrgRecord.objects.all())

    importer = PageImportManager()

    # no org may be created, so the new place has no owner
    outcome = importer.import_place(unowned_pid, import_owners=False)
    self.assertIsNone(outcome.model_instance.owner)

    # an org already on file must still be hooked up without importing
    outcome = importer.import_place(owned_pid, import_owners=False)
    self.assertIsNotNone(outcome.model_instance)
    expected_owner = FacebookOrgRecord.objects.get(fb_id=owned_pid).organization
    self.assertEquals(outcome.model_instance.owner, expected_owner)

    # the Organization and FacebookOrgRecord tables must be unchanged
    self.assertEquals(orgs_before, list(Organization.objects.all()))
    self.assertEquals(records_before, list(FacebookOrgRecord.objects.all()))
def test_pulling(self):
    """
    Tests internal FB page info gathering code -- not model importing.
    """
    known_ids = ["84714961156",        # Square Cafe
                 "9423481220941280"]   # invalid fbid
    # 100 random ids on top to make sure the batch code behaves
    page_ids = known_ids + [str(random.randint(1, 1e12)) for _ in range(100)]

    results = PageImportManager().pull_page_info(page_ids)
    self.assertEquals(len(results), len(page_ids))

    # Nothing specific can be asserted about third-party pages; just check
    # the first result exposes a couple of page-specific fields.
    self.assertIn("name", results[0].keys())
    self.assertIn("location", results[0].keys())  # this is a Place page

    # the bogus id should surface as an API error, not cached data
    self.assertTrue(isinstance(results[1], FacebookAPIError))
def import_ids(page_ids):
    '''Import the given Facebook page ids as Organizations and Places.

    For each id the page info JSON is pulled and cached on its
    FacebookPage record, then an Organization import and a Place import
    are attempted. Outcomes are written to importlog.
    '''
    page_mgr = PageImportManager()
    importlog.info('Pulling %d pages from Facebook' % len(page_ids))
    importlog.info('Importing Organizations into database from %d pages' % len(page_ids))
    page_infos = page_mgr.pull_page_info(page_ids)  # cache pages

    importlog.info('Refresing cached FB page infos')
    for pid, info in zip(page_ids, page_infos):
        if not isinstance(info, dict):
            # failed pulls come back as error objects instead of dicts
            importlog.info('Cannot store page info JSON for fb id %s' % str(pid))
            continue
        info.pop('metadata', None)  # don't need to store metadata if it exists
        record, _ = FacebookPage.objects.get_or_create(fb_id=pid)
        record.pageinfo_json = json.dumps(info)
        record.save()

    def _log_report(report, label):
        '''Log the outcome of one import report; return 1 on a fresh import.'''
        if report.notices:
            for notice in report.notices:
                if isinstance(notice, PageImportReport.ModelInstanceExists):
                    importlog.info('%s: Record for %s exists' % (report.page_id, label))
                else:
                    importlog.error('%s: %s' % (report.page_id, unicode(notice)))
            return 0
        importlog.info('%s: Imported successfully as %s (id=%d)' %
                       (report.page_id,
                        unicode(report.model_instance),
                        report.model_instance.id))
        return 1

    import_count = sum(_log_report(page_mgr.import_org(pid), 'Organization')
                       for pid in page_ids)
    importlog.info('%d new Organizations imported' % import_count)

    # BUG FIX: removed the stray `page_mgr.import_place(page_ids, import_owners=True)`
    # call -- it passed the whole id list where a single id is expected and its
    # (generator) result was never consumed; the per-id loop below does the work.
    import_count = sum(_log_report(page_mgr.import_place(pid), 'Place')
                       for pid in page_ids)
    importlog.info('%d new Places imported' % import_count)
def run():
    # One-shot bootstrap script: wipes all place/org tables, then re-imports
    # everything from obid.csv, linking Facebook pages and tagging places via
    # Google Places. Python 2 script (print statements).
    in_filename = os.path.join(os.path.dirname(__file__), 'obid.csv')
    # clear all tables -- destructive: every existing record is deleted first
    Location.objects.all().delete()
    PlaceMeta.objects.all().delete()
    Place.objects.all().delete()
    Organization.objects.all().delete()
    ExternalPlaceSource.objects.all().delete()
    FacebookPage.objects.all().delete()
    FacebookOrgRecord.objects.all().delete()

    gplaces_category_map = load_category_map('google_places')
    gp_hits, gp_misses = 0, 0  # Google Places lookup success/failure tally

    rows = OBIDRow.rows_from_csv(in_filename)

    # cycle through each row with a facebook reference and store a reference
    page_mgr = PageImportManager()
    fb_rows = [row for row in rows if row.fb_id]
    for row, info in zip(
            fb_rows,
            page_mgr.pull_page_info([row.fb_id for row in fb_rows])):
        if isinstance(info, dict):
            info.pop('metadata', None)  # don't need to store metadata if it exists
            FacebookPage.objects.get_or_create(
                fb_id=info['id'],
                defaults=dict(pageinfo_json=json.dumps(info)))
            row.fb_id = info['id']  # ensure a numeric id
        else:
            # pull failed -- clear fb_id so later steps fall back to CSV data
            print 'ERROR: Pulling fb page %s resulted in the following exception: "%s"' % (
                str(row.fb_id), str(info))
            row.fb_id = ''

    # cycle through all rows and store everything
    for i, row in enumerate(rows):
        # NOTE(review): a row without a place is reported but still processed
        # below -- confirm whether this should `continue` instead.
        if not row.place:
            print 'ERROR: no place for entry %d' % i

        # resolve the location (postcode hard-coded to Pittsburgh's 15213)
        location = resolve_location(
            Location(address=row.address, postcode='15213'))
        if location:
            # hack to get around Google Geocoding appending the unviersity onto all addresses:
            # strip the first comma-separated component when the geocoder
            # prefixed an institution name not present in the source address
            if ( location.address.startswith('University') and not row.address.lower().startswith('univ') ) or \
               ( location.address.startswith('Carnegie Mellon') and row.address.lower().startswith('carnegie mellon') ):
                location.address = ','.join(location.address.split(',')[1:])
            try:
                # if exact match exists, use it instead of the newly found one
                location = Location.objects.get(address=location.address,
                                                postcode=location.postcode)
            except Location.DoesNotExist:
                location.save()
        else:
            print 'WARNING: Geocoding failed for entry %d ("%s")' % (i, row.place)

        # org differs from place when the CSV names them differently
        diff_org = row.org != row.place
        org, place = None, None

        # import org
        # if the row has a fb id, we'll try to import the Org from Facebook
        # only import Org from Facebook if it's the same as the Place (fb id relates to place only)
        if row.fb_id and not diff_org:
            try:
                org = FacebookOrgRecord.objects.get(fb_id=row.fb_id)
            except FacebookOrgRecord.DoesNotExist:
                report = page_mgr.import_org(row.fb_id)
                if report.model_instance:
                    org = report.model_instance
                else:
                    print 'WARNING: Organization FB import failed for entry %d (fbid %s)' % (
                        i, str(row.fb_id))
        if not org:
            # fall back to a plain org keyed by name from the CSV
            org, created = Organization.objects.get_or_create(name=row.org)

        # import place
        if row.fb_id:
            try:
                place = ExternalPlaceSource.facebook.get(uid=row.fb_id)
            except ExternalPlaceSource.DoesNotExist:
                report = page_mgr.import_place(row.fb_id, import_owners=False)
                if report.model_instance:
                    place = report.model_instance
                    if not place.owner:
                        # no owner is created automatically, so set it if not created
                        place.owner = org
                        place.save()
                else:
                    print 'WARNING: Place FB import failed for entry %d (fbid %s)' % (
                        i, str(row.fb_id))
        if not place:
            # fall back to a plain place built from the CSV row
            place, created = Place.objects.get_or_create(name=row.place,
                                                         location=location,
                                                         owner=org)

        # attach url/phone metadata from the CSV
        if row.url:
            PlaceMeta.objects.create(place=place, meta_key='url', meta_value=row.url)
            if not diff_org:
                # also save the url as the org's url if they're the same
                org.url = row.url
                org.save()
        if row.phone:
            PlaceMeta.objects.create(place=place, meta_key='phone', meta_value=row.phone)

        print 'Imported %s' % row.place
        try:
            print ' (linked to FB page %s)' % ExternalPlaceSource.facebook.get(
                place=place).uid
        except ExternalPlaceSource.DoesNotExist:
            pass

        # store tags from Google Place lookup; search near the resolved
        # location, or near a city-center default with a wider radius
        if location and \
           location.latitude is not None and location.longitude is not None:
            coords = (location.latitude, location.longitude)
            radius = 1000
        else:
            coords = (40.4425, -79.9575)  # presumably central Pittsburgh -- verify
            radius = 5000
        response = gplaces_client.search_request(coords, radius, keyword=row.place)
        if len(response) > 0 and 'reference' in response[0]:
            details = gplaces_client.details_request(response[0]['reference'])
            # translate Google Places types into our category tags
            all_tags = set()
            for typ in details.get('types', []):
                if typ in gplaces_category_map:
                    all_tags.update(gplaces_category_map[typ])
                else:
                    print 'WARNING: Unknown Google Places type: "%s"' % typ
            if len(all_tags) > 0:
                print ' Tags:',
                for t in all_tags:
                    print '%s,' % t,
                print
            gp_hits += 1
        else:
            print ' WARNING: Failure querying Google Places for "%s" within %dm of (%f,%f)' % (
                row.place, radius, coords[0], coords[1])
            gp_misses += 1
    print gp_hits, gp_misses
def run():
    # One-shot bootstrap script: wipes all place/org tables, then re-imports
    # everything from obid.csv, linking Facebook pages and tagging places via
    # Google Places. Python 2 script (print statements).
    in_filename = os.path.join(os.path.dirname(__file__),'obid.csv')
    # clear all tables -- destructive: every existing record is deleted first
    Location.objects.all().delete()
    PlaceMeta.objects.all().delete()
    Place.objects.all().delete()
    Organization.objects.all().delete()
    ExternalPlaceSource.objects.all().delete()
    FacebookPage.objects.all().delete()
    FacebookOrgRecord.objects.all().delete()

    gplaces_category_map = load_category_map('google_places')
    gp_hits, gp_misses = 0,0  # Google Places lookup success/failure tally

    rows = OBIDRow.rows_from_csv(in_filename)

    # cycle through each row with a facebook reference and store a reference
    page_mgr = PageImportManager()
    fb_rows = [row for row in rows if row.fb_id]
    for row,info in zip(fb_rows,page_mgr.pull_page_info([row.fb_id for row in fb_rows])):
        if isinstance(info,dict):
            info.pop('metadata',None)   # don't need to store metadata if it exists
            FacebookPage.objects.get_or_create(fb_id=info['id'],
                                                defaults=dict(pageinfo_json=json.dumps(info)))
            row.fb_id = info['id']      # ensure a numeric id
        else:
            # pull failed -- clear fb_id so later steps fall back to CSV data
            print 'ERROR: Pulling fb page %s resulted in the following exception: "%s"' % (str(row.fb_id),str(info))
            row.fb_id = ''

    # cycle through all rows and store everything
    for i,row in enumerate(rows):
        # NOTE(review): a row without a place is reported but still processed
        # below -- confirm whether this should `continue` instead.
        if not row.place:
            print 'ERROR: no place for entry %d' % i

        # resolve the location (postcode hard-coded to Pittsburgh's 15213)
        location = resolve_location(Location(address=row.address,postcode='15213'))
        if location:
            # hack to get around Google Geocoding appending the unviersity onto all addresses:
            # strip the first comma-separated component when the geocoder
            # prefixed an institution name not present in the source address
            if ( location.address.startswith('University') and not row.address.lower().startswith('univ') ) or \
               ( location.address.startswith('Carnegie Mellon') and row.address.lower().startswith('carnegie mellon') ):
                location.address = ','.join(location.address.split(',')[1:])
            try:
                # if exact match exists, use it instead of the newly found one
                location = Location.objects.get(address=location.address,postcode=location.postcode)
            except Location.DoesNotExist:
                location.save()
        else:
            print 'WARNING: Geocoding failed for entry %d ("%s")' % (i,row.place)

        # org differs from place when the CSV names them differently
        diff_org = row.org != row.place
        org, place = None, None

        # import org
        # if the row has a fb id, we'll try to import the Org from Facebook
        # only import Org from Facebook if it's the same as the Place (fb id relates to place only)
        if row.fb_id and not diff_org:
            try:
                org = FacebookOrgRecord.objects.get(fb_id=row.fb_id)
            except FacebookOrgRecord.DoesNotExist:
                report = page_mgr.import_org(row.fb_id)
                if report.model_instance:
                    org = report.model_instance
                else:
                    print 'WARNING: Organization FB import failed for entry %d (fbid %s)' % (i,str(row.fb_id))
        if not org:
            # fall back to a plain org keyed by name from the CSV
            org,created = Organization.objects.get_or_create(name=row.org)

        # import place
        if row.fb_id:
            try:
                place = ExternalPlaceSource.facebook.get(uid=row.fb_id)
            except ExternalPlaceSource.DoesNotExist:
                report = page_mgr.import_place(row.fb_id,import_owners=False)
                if report.model_instance:
                    place = report.model_instance
                    if not place.owner:
                        # no owner is created automatically, so set it if not created
                        place.owner = org
                        place.save()
                else:
                    print 'WARNING: Place FB import failed for entry %d (fbid %s)' % (i,str(row.fb_id))
        if not place:
            # fall back to a plain place built from the CSV row
            place,created = Place.objects.get_or_create(name=row.place,location=location,owner=org)

        # attach url/phone metadata from the CSV
        if row.url:
            PlaceMeta.objects.create(place=place,meta_key='url',meta_value=row.url)
            if not diff_org:
                # also save the url as the org's url if they're the same
                org.url = row.url
                org.save()
        if row.phone:
            PlaceMeta.objects.create(place=place,meta_key='phone',meta_value=row.phone)

        print 'Imported %s' % row.place
        try:
            print ' (linked to FB page %s)' % ExternalPlaceSource.facebook.get(place=place).uid
        except ExternalPlaceSource.DoesNotExist:
            pass

        # store tags from Google Place lookup; search near the resolved
        # location, or near a city-center default with a wider radius
        if location and \
           location.latitude is not None and location.longitude is not None:
            coords = (location.latitude,location.longitude)
            radius = 1000
        else:
            coords = (40.4425,-79.9575)  # presumably central Pittsburgh -- verify
            radius = 5000
        response = gplaces_client.search_request(coords,radius,keyword=row.place)
        if len(response) > 0 and 'reference' in response[0]:
            details = gplaces_client.details_request(response[0]['reference'])
            # translate Google Places types into our category tags
            all_tags = set()
            for typ in details.get('types',[]):
                if typ in gplaces_category_map:
                    all_tags.update(gplaces_category_map[typ])
                else:
                    print 'WARNING: Unknown Google Places type: "%s"' % typ
            if len(all_tags) > 0:
                print ' Tags:',
                for t in all_tags:
                    print '%s,' % t,
                print
            gp_hits += 1
        else:
            print ' WARNING: Failure querying Google Places for "%s" within %dm of (%f,%f)' % (row.place,radius,coords[0],coords[1])
            gp_misses += 1
    print gp_hits, gp_misses