Esempio n. 1
0
    def test_import_no_owner(self):
        '''Tests importing FB pages as Places with owner importing disabled.

        Two pages are imported with ``import_owners=False``:

        * a page whose org and place are both absent from the fixture -- the
          imported Place must end up with no owner;
        * a page whose org is in the fixture but whose place is not -- the
          imported Place must be linked to that existing Organization.

        Afterwards the Organization and FacebookOrgRecord tables must contain
        exactly what they contained before the imports.
        '''
        no_owner_stored = '84714961156'  # square cafe (org and place not in fixture)
        owner_stored = '50141015898'  # voluto coffee (org in fixture but not place)

        # snapshot both tables so we can verify below that nothing changed
        before_orgs = list(Organization.objects.all())
        before_records = list(FacebookOrgRecord.objects.all())

        mgr = PageImportManager()

        # ensure no org is created
        result = mgr.import_place(no_owner_stored, import_owners=False)
        # report a failed import as an assertion failure rather than an
        # AttributeError on None when reading .owner
        self.assertIsNotNone(result.model_instance)
        self.assertIsNone(result.model_instance.owner)

        # ensure the existing org is found, even without import
        result = mgr.import_place(owner_stored, import_owners=False)
        self.assertIsNotNone(result.model_instance)
        self.assertEqual(
            result.model_instance.owner,
            FacebookOrgRecord.objects.get(fb_id=owner_stored).organization)

        # double check that the Organization and FacebookOrgRecord tables weren't touched
        self.assertEqual(before_orgs, list(Organization.objects.all()))
        self.assertEqual(before_records,
                         list(FacebookOrgRecord.objects.all()))
def _import_and_log(import_fn, page_ids, model_label):
    '''Runs import_fn on every page id and logs the outcome of each import.

    import_fn is called with a single page id and must return a report
    exposing ``page_id``, ``notices`` and ``model_instance``. model_label is
    the model name used in the log messages (e.g. 'Organization').

    Returns the number of page ids whose report carried no notices.
    '''
    import_count = 0
    for pid in page_ids:
        report = import_fn(pid)
        if not report.notices:
            importlog.info('%s: Imported successfully as %s (id=%d)' %
                           (report.page_id, unicode(report.model_instance),
                            report.model_instance.id))
            import_count += 1
            continue
        for notice in report.notices:
            # an already-existing record is informational; anything else is an error
            if isinstance(notice, PageImportReport.ModelInstanceExists):
                importlog.info('%s: Record for %s exists' %
                               (report.page_id, model_label))
            else:
                importlog.error('%s: %s' %
                                (report.page_id, unicode(notice)))
    return import_count


def import_ids(page_ids):
    '''Imports the given Facebook page ids as Organizations and then as Places.

    The page infos are pulled first and stored as JSON on the matching
    FacebookPage record (created when missing). Each id is then imported as
    an Organization and afterwards as a Place; the per-id outcome and the
    totals are written to ``importlog``.
    '''
    page_mgr = PageImportManager()
    importlog.info('Pulling %d pages from Facebook' % len(page_ids))

    importlog.info('Importing Organizations into database from %d pages' %
                   len(page_ids))
    page_infos = page_mgr.pull_page_info(page_ids)  # cache pages

    importlog.info('Refresing cached FB page infos')
    for pid, info in zip(page_ids, page_infos):
        if not isinstance(info, dict):
            # anything other than a dict cannot be serialized as page info
            importlog.info('Cannot store page info JSON for fb id %s' %
                           str(pid))
            continue
        info.pop('metadata', None)  # don't need to store metadata if it exists
        record, _ = FacebookPage.objects.get_or_create(fb_id=pid)
        record.pageinfo_json = json.dumps(info)
        record.save()

    org_count = _import_and_log(page_mgr.import_org, page_ids, 'Organization')
    importlog.info('%d new Organizations imported' % org_count)

    # NOTE(review): the result of this batch call was never consumed (the
    # original comment called it a "generator object"). The call is kept in
    # case it has side effects -- confirm whether it can be removed.
    page_mgr.import_place(page_ids, import_owners=True)
    place_count = _import_and_log(page_mgr.import_place, page_ids, 'Place')
    importlog.info('%d new Places imported' % place_count)
Esempio n. 3
0
def _import_and_log(import_fn, page_ids, model_label):
    '''Runs import_fn on every page id and logs the outcome of each import.

    import_fn is called with a single page id and must return a report
    exposing ``page_id``, ``notices`` and ``model_instance``. model_label is
    the model name used in the log messages (e.g. 'Place').

    Returns the number of page ids whose report carried no notices.
    '''
    import_count = 0
    for pid in page_ids:
        report = import_fn(pid)
        if not report.notices:
            importlog.info('%s: Imported successfully as %s (id=%d)' % (
                report.page_id,
                unicode(report.model_instance),
                report.model_instance.id))
            import_count += 1
            continue
        for notice in report.notices:
            # an already-existing record is informational; anything else is an error
            if isinstance(notice, PageImportReport.ModelInstanceExists):
                importlog.info('%s: Record for %s exists' % (report.page_id, model_label))
            else:
                importlog.error('%s: %s' % (report.page_id, unicode(notice)))
    return import_count


def import_ids(page_ids):
    '''Imports the given Facebook page ids as Organizations and then as Places.

    The page infos are pulled first and stored as JSON on the matching
    FacebookPage record (created when missing). Each id is then imported as
    an Organization and afterwards as a Place; the per-id outcome and the
    totals are written to ``importlog``.
    '''
    page_mgr = PageImportManager()
    importlog.info('Pulling %d pages from Facebook' % len(page_ids))

    importlog.info('Importing Organizations into database from %d pages' % len(page_ids))
    page_infos = page_mgr.pull_page_info(page_ids)    # cache pages

    importlog.info('Refresing cached FB page infos')
    for pid, info in zip(page_ids, page_infos):
        if not isinstance(info, dict):
            # anything other than a dict cannot be serialized as page info
            importlog.info('Cannot store page info JSON for fb id %s' % str(pid))
            continue
        info.pop('metadata', None)       # don't need to store metadata if it exists
        record, _ = FacebookPage.objects.get_or_create(fb_id=pid)
        record.pageinfo_json = json.dumps(info)
        record.save()

    org_count = _import_and_log(page_mgr.import_org, page_ids, 'Organization')
    importlog.info('%d new Organizations imported' % org_count)

    # NOTE(review): the result of this batch call was never consumed (the
    # original comment called it a "generator object"). The call is kept in
    # case it has side effects -- confirm whether it can be removed.
    page_mgr.import_place(page_ids, import_owners=True)
    place_count = _import_and_log(page_mgr.import_place, page_ids, 'Place')
    importlog.info('%d new Places imported' % place_count)
Esempio n. 4
0
    def test_import(self):
        '''Tests the importing of a batch of FB pages as Places.

        Every page id is paired with the notice class its import is expected
        to produce, or None when the import should succeed cleanly. The pairs
        are shuffled before importing, then each returned report is checked
        against its expectation and against the stored ExternalPlaceSource
        records.
        '''
        mgr = PageImportManager()
        pid_notice_pairs = [
            ('84714961156', None),  # Square Cafe
            ('139288502700', TypeError),  # Pgh Marathon (no location)
            ('291107654260858', TypeError),  # event page
            ('9423481220941280', FacebookAPIError),  # bogus id
            ('53379078585', PageImportReport.ModelInstanceExists),  # big dog
        ]
        random.shuffle(pid_notice_pairs)

        # grab original FB records from any pages that already exist
        original_fb_records = {}
        for pid, notice in pid_notice_pairs:
            if notice is PageImportReport.ModelInstanceExists:
                original_fb_records[pid] = ExternalPlaceSource.facebook.get(
                    uid=pid)
        pids = [pid for pid, _ in pid_notice_pairs]

        # run insertion code
        mgr.pull_page_info(pids)  # cache pages
        results = [mgr.import_place(pid) for pid in pids]
        self.assertEqual([result.page_id for result in results], pids,
                         'non-parallel list of PageImportReports returned')

        for (pid, expected_notice), result in zip(pid_notice_pairs, results):
            if expected_notice is None:
                self.assertEqual([], result.notices)
                # assert a new model instance was created and its FB record matches what was returned
                try:
                    place = ExternalPlaceSource.facebook.get(uid=pid).place
                except ExternalPlaceSource.DoesNotExist:
                    self.fail('No place record for fbid %s' % pid)
                self.assertEqual(place, result.model_instance,
                                 'No place created for fbid %s' % pid)
            else:
                # assert no model instance is returned
                self.assertIsNone(result.model_instance)
                # assert expected notice was generated
                self.assertEqual(len(result.notices), 1)
                self.assertIsInstance(
                    result.notices[0], expected_notice,
                    'Expecting notice %s from importing fb page %s' %
                    (str(expected_notice), pid))

                # if notice was a ModelInstanceExists, be sure the original record wasn't touched
                if expected_notice is PageImportReport.ModelInstanceExists:
                    self.assertEqual(
                        original_fb_records[pid],
                        ExternalPlaceSource.facebook.get(uid=pid))
                # otherwise, make sure no record was created at all
                else:
                    with self.assertRaises(ExternalPlaceSource.DoesNotExist):
                        ExternalPlaceSource.facebook.get(uid=pid)
Esempio n. 5
0
    def test_import(self):
        """Tests the importing of a batch of FB pages as Places.

        Every page id is paired with the notice class its import is expected
        to produce, or None when the import should succeed cleanly. The pairs
        are shuffled before importing, then each returned report is checked
        against its expectation and against the stored ExternalPlaceSource
        records.
        """
        mgr = PageImportManager()
        pid_notice_pairs = [
            ("84714961156", None),  # Square Cafe
            ("139288502700", TypeError),  # Pgh Marathon (no location)
            ("291107654260858", TypeError),  # event page
            ("9423481220941280", FacebookAPIError),  # bogus id
            ("53379078585", PageImportReport.ModelInstanceExists),  # big dog
        ]
        random.shuffle(pid_notice_pairs)

        # grab original FB records from any pages that already exist
        original_fb_records = {}
        for pid, notice in pid_notice_pairs:
            if notice is PageImportReport.ModelInstanceExists:
                original_fb_records[pid] = ExternalPlaceSource.facebook.get(uid=pid)
        pids = [pid for pid, _ in pid_notice_pairs]

        # run insertion code
        mgr.pull_page_info(pids)  # cache pages
        results = [mgr.import_place(pid) for pid in pids]
        self.assertEqual(
            [result.page_id for result in results],
            pids,
            "non-parallel list of PageImportReports returned",
        )

        for (pid, expected_notice), result in zip(pid_notice_pairs, results):
            if expected_notice is None:
                self.assertEqual([], result.notices)
                # assert a new model instance was created and its FB record matches what was returned
                try:
                    place = ExternalPlaceSource.facebook.get(uid=pid).place
                except ExternalPlaceSource.DoesNotExist:
                    self.fail("No place record for fbid %s" % pid)
                self.assertEqual(place, result.model_instance, "No place created for fbid %s" % pid)
            else:
                # assert no model instance is returned
                self.assertIsNone(result.model_instance)
                # assert expected notice was generated
                self.assertEqual(len(result.notices), 1)
                self.assertIsInstance(
                    result.notices[0],
                    expected_notice,
                    "Expecting notice %s from importing fb page %s" % (str(expected_notice), pid),
                )

                # if notice was a ModelInstanceExists, be sure the original record wasn't touched
                if expected_notice is PageImportReport.ModelInstanceExists:
                    self.assertEqual(original_fb_records[pid], ExternalPlaceSource.facebook.get(uid=pid))
                # otherwise, make sure no record was created at all
                else:
                    with self.assertRaises(ExternalPlaceSource.DoesNotExist):
                        ExternalPlaceSource.facebook.get(uid=pid)
Esempio n. 6
0
    def test_import_no_owner(self):
        """Tests importing FB pages as Places with owner importing disabled.

        Two pages are imported with ``import_owners=False``:

        * a page whose org and place are both absent from the fixture -- the
          imported Place must end up with no owner;
        * a page whose org is in the fixture but whose place is not -- the
          imported Place must be linked to that existing Organization.

        Afterwards the Organization and FacebookOrgRecord tables must contain
        exactly what they contained before the imports.
        """
        no_owner_stored = "84714961156"  # square cafe (org and place not in fixture)
        owner_stored = "50141015898"  # voluto coffee (org in fixture but not place)

        # snapshot both tables so we can verify below that nothing changed
        before_orgs = list(Organization.objects.all())
        before_records = list(FacebookOrgRecord.objects.all())

        mgr = PageImportManager()

        # ensure no org is created
        result = mgr.import_place(no_owner_stored, import_owners=False)
        # report a failed import as an assertion failure rather than an
        # AttributeError on None when reading .owner
        self.assertIsNotNone(result.model_instance)
        self.assertIsNone(result.model_instance.owner)

        # ensure the existing org is found, even without import
        result = mgr.import_place(owner_stored, import_owners=False)
        self.assertIsNotNone(result.model_instance)
        self.assertEqual(result.model_instance.owner, FacebookOrgRecord.objects.get(fb_id=owner_stored).organization)

        # double check that the Organization and FacebookOrgRecord tables weren't touched
        self.assertEqual(before_orgs, list(Organization.objects.all()))
        self.assertEqual(before_records, list(FacebookOrgRecord.objects.all()))
Esempio n. 7
0
def run():
    '''Rebuilds the place-related tables from the obid.csv file beside this module.

    Steps, in order:

    1. Delete every Location, PlaceMeta, Place, Organization,
       ExternalPlaceSource, FacebookPage and FacebookOrgRecord row.
    2. Pull the Facebook page info for each CSV row that has an fb id and
       store it as a FacebookPage record; rows whose pull did not yield a
       dict get their fb id blanked.
    3. For each row: resolve its location, find or create its Organization
       and Place (importing from Facebook when an fb id is available), and
       store the row's url and phone as PlaceMeta entries.
    4. Look the place up through ``gplaces_client`` and print the tags mapped
       from the returned types.

    Progress, warnings and errors are printed to stdout; the final line
    printed is the Google Places hit and miss counts.
    '''
    in_filename = os.path.join(os.path.dirname(__file__), 'obid.csv')

    #clear all tables
    Location.objects.all().delete()
    PlaceMeta.objects.all().delete()
    Place.objects.all().delete()
    Organization.objects.all().delete()
    ExternalPlaceSource.objects.all().delete()
    FacebookPage.objects.all().delete()
    FacebookOrgRecord.objects.all().delete()

    gplaces_category_map = load_category_map('google_places')
    gp_hits, gp_misses = 0, 0

    rows = OBIDRow.rows_from_csv(in_filename)

    # cycle through each row with a facebook reference and store a reference
    page_mgr = PageImportManager()
    fb_rows = [row for row in rows if row.fb_id]
    page_infos = page_mgr.pull_page_info([row.fb_id for row in fb_rows])
    for row, info in zip(fb_rows, page_infos):
        if isinstance(info, dict):
            info.pop('metadata',
                     None)  # don't need to store metadata if it exists
            FacebookPage.objects.get_or_create(
                fb_id=info['id'],
                defaults=dict(pageinfo_json=json.dumps(info)))
            row.fb_id = info['id']  # ensure a numeric id
        else:
            print 'ERROR: Pulling fb page %s resulted in the following exception: "%s"' % (
                str(row.fb_id), str(info))
            # blank the id so the Facebook import branches below are skipped
            row.fb_id = ''

    # cycle through all rows and store everything
    for i, row in enumerate(rows):
        if not row.place:
            # NOTE(review): the row is still processed after this message;
            # confirm whether it should be skipped instead.
            print 'ERROR: no place for entry %d' % i

        # resolve the location
        location = resolve_location(
            Location(address=row.address, postcode='15213'))

        if location:
            # hack to get around Google Geocoding appending the university onto all addresses
            # NOTE(review): the second clause has no `not`, unlike the first;
            # confirm that asymmetry is intended.
            if ( location.address.startswith('University') and not row.address.lower().startswith('univ') ) or \
               ( location.address.startswith('Carnegie Mellon') and row.address.lower().startswith('carnegie mellon') ):
                location.address = ','.join(location.address.split(',')[1:])

            try:
                # if exact match exists, use it instead of the newly found one
                location = Location.objects.get(address=location.address,
                                                postcode=location.postcode)
            except Location.DoesNotExist:
                location.save()
        else:
            print 'WARNING: Geocoding failed for entry %d ("%s")' % (i,
                                                                     row.place)

        diff_org = row.org != row.place
        org, place = None, None

        # import org
        # if the row has a fb id, we'll try to import the Org from Facebook
        # only import Org from Facebook if it's the same as the Place (fb id relates to place only)
        if row.fb_id and not diff_org:
            try:
                # use the Organization behind the record, not the record
                # itself: `org` is later saved as a Place owner
                org = FacebookOrgRecord.objects.get(
                    fb_id=row.fb_id).organization
            except FacebookOrgRecord.DoesNotExist:
                report = page_mgr.import_org(row.fb_id)
                if report.model_instance:
                    org = report.model_instance
                else:
                    print 'WARNING: Organization FB import failed for entry %d (fbid %s)' % (
                        i, str(row.fb_id))

        if not org:
            org, created = Organization.objects.get_or_create(name=row.org)

        # import place
        if row.fb_id:
            try:
                # use the Place behind the source record, not the record
                # itself: `place` is later passed to PlaceMeta as its place
                place = ExternalPlaceSource.facebook.get(
                    uid=row.fb_id).place
            except ExternalPlaceSource.DoesNotExist:
                report = page_mgr.import_place(row.fb_id, import_owners=False)
                if report.model_instance:
                    place = report.model_instance
                    if not place.owner:  # no owner is created automatically, so set it if not created
                        place.owner = org
                        place.save()
                else:
                    print 'WARNING: Place FB import failed for entry %d (fbid %s)' % (
                        i, str(row.fb_id))

        if not place:
            place, created = Place.objects.get_or_create(name=row.place,
                                                         location=location,
                                                         owner=org)

        if row.url:
            PlaceMeta.objects.create(place=place,
                                     meta_key='url',
                                     meta_value=row.url)
            if not diff_org:  # also save the url as the org's url if they're the same
                org.url = row.url
                org.save()

        if row.phone:
            PlaceMeta.objects.create(place=place,
                                     meta_key='phone',
                                     meta_value=row.phone)

        print 'Imported %s' % row.place
        try:
            print '  (linked to FB page %s)' % ExternalPlaceSource.facebook.get(
                place=place).uid
        except ExternalPlaceSource.DoesNotExist:
            pass  # place has no Facebook source record; nothing to report

        # look up tags from Google Places
        # NOTE(review): the tags are only printed below, never saved.
        if location and \
            location.latitude is not None and location.longitude is not None:
            coords = (location.latitude, location.longitude)
            radius = 1000
        else:
            # no usable coordinates: search a wider radius around a fixed point
            coords = (40.4425, -79.9575)
            radius = 5000

        response = gplaces_client.search_request(coords,
                                                 radius,
                                                 keyword=row.place)

        if len(response) > 0 and 'reference' in response[0]:
            details = gplaces_client.details_request(response[0]['reference'])
            all_tags = set()
            for typ in details.get('types', []):
                if typ in gplaces_category_map:
                    all_tags.update(gplaces_category_map[typ])
                else:
                    print 'WARNING: Unknown Google Places type: "%s"' % typ
            if len(all_tags) > 0:
                print '  Tags:',
                for t in all_tags:
                    print '%s,' % t,
                print
            gp_hits += 1
        else:
            print '  WARNING: Failure querying Google Places for "%s" within %dm of (%f,%f)' % (
                row.place, radius, coords[0], coords[1])
            gp_misses += 1
    print gp_hits, gp_misses
Esempio n. 8
0
def run():
    '''Rebuilds the place-related tables from the obid.csv file beside this module.

    Steps, in order:

    1. Delete every Location, PlaceMeta, Place, Organization,
       ExternalPlaceSource, FacebookPage and FacebookOrgRecord row.
    2. Pull the Facebook page info for each CSV row that has an fb id and
       store it as a FacebookPage record; rows whose pull did not yield a
       dict get their fb id blanked.
    3. For each row: resolve its location, find or create its Organization
       and Place (importing from Facebook when an fb id is available), and
       store the row's url and phone as PlaceMeta entries.
    4. Look the place up through ``gplaces_client`` and print the tags mapped
       from the returned types.

    Progress, warnings and errors are printed to stdout; the final line
    printed is the Google Places hit and miss counts.
    '''
    in_filename = os.path.join(os.path.dirname(__file__),'obid.csv')

    #clear all tables
    Location.objects.all().delete()
    PlaceMeta.objects.all().delete()
    Place.objects.all().delete()
    Organization.objects.all().delete()
    ExternalPlaceSource.objects.all().delete()
    FacebookPage.objects.all().delete()
    FacebookOrgRecord.objects.all().delete()

    gplaces_category_map = load_category_map('google_places')
    gp_hits, gp_misses = 0,0

    rows = OBIDRow.rows_from_csv(in_filename)

    # cycle through each row with a facebook reference and store a reference
    page_mgr = PageImportManager()
    fb_rows = [row for row in rows if row.fb_id]
    page_infos = page_mgr.pull_page_info([row.fb_id for row in fb_rows])
    for row,info in zip(fb_rows,page_infos):
        if isinstance(info,dict):
            info.pop('metadata',None)       # don't need to store metadata if it exists
            FacebookPage.objects.get_or_create(fb_id=info['id'],
                                        defaults=dict(pageinfo_json=json.dumps(info)))
            row.fb_id = info['id']  # ensure a numeric id
        else:
            print 'ERROR: Pulling fb page %s resulted in the following exception: "%s"' % (str(row.fb_id),str(info))
            # blank the id so the Facebook import branches below are skipped
            row.fb_id = ''

    # cycle through all rows and store everything
    for i,row in enumerate(rows):
        if not row.place:
            # NOTE(review): the row is still processed after this message;
            # confirm whether it should be skipped instead.
            print 'ERROR: no place for entry %d' % i

        # resolve the location
        location = resolve_location(Location(address=row.address,postcode='15213'))

        if location:
            # hack to get around Google Geocoding appending the university onto all addresses
            # NOTE(review): the second clause has no `not`, unlike the first;
            # confirm that asymmetry is intended.
            if ( location.address.startswith('University') and not row.address.lower().startswith('univ') ) or \
               ( location.address.startswith('Carnegie Mellon') and row.address.lower().startswith('carnegie mellon') ):
                location.address = ','.join(location.address.split(',')[1:])

            try:
                # if exact match exists, use it instead of the newly found one
                location = Location.objects.get(address=location.address,postcode=location.postcode)
            except Location.DoesNotExist:
                location.save()
        else:
            print 'WARNING: Geocoding failed for entry %d ("%s")' % (i,row.place)

        diff_org = row.org != row.place
        org, place = None, None

        # import org
        # if the row has a fb id, we'll try to import the Org from Facebook
        # only import Org from Facebook if it's the same as the Place (fb id relates to place only)
        if row.fb_id and not diff_org:
            try:
                # use the Organization behind the record, not the record
                # itself: `org` is later saved as a Place owner
                org = FacebookOrgRecord.objects.get(fb_id=row.fb_id).organization
            except FacebookOrgRecord.DoesNotExist:
                report = page_mgr.import_org(row.fb_id)
                if report.model_instance:
                    org = report.model_instance
                else:
                    print 'WARNING: Organization FB import failed for entry %d (fbid %s)' % (i,str(row.fb_id))

        if not org:
            org,created = Organization.objects.get_or_create(name=row.org)

        # import place
        if row.fb_id:
            try:
                # use the Place behind the source record, not the record
                # itself: `place` is later passed to PlaceMeta as its place
                place = ExternalPlaceSource.facebook.get(uid=row.fb_id).place
            except ExternalPlaceSource.DoesNotExist:
                report = page_mgr.import_place(row.fb_id,import_owners=False)
                if report.model_instance:
                    place = report.model_instance
                    if not place.owner:     # no owner is created automatically, so set it if not created
                        place.owner = org
                        place.save()
                else:
                    print 'WARNING: Place FB import failed for entry %d (fbid %s)' % (i,str(row.fb_id))

        if not place:
            place,created = Place.objects.get_or_create(name=row.place,location=location,owner=org)

        if row.url:
            PlaceMeta.objects.create(place=place,meta_key='url',meta_value=row.url)
            if not diff_org:    # also save the url as the org's url if they're the same
                org.url = row.url
                org.save()

        if row.phone:
            PlaceMeta.objects.create(place=place,meta_key='phone',meta_value=row.phone)

        print 'Imported %s' % row.place
        try:
            print '  (linked to FB page %s)' % ExternalPlaceSource.facebook.get(place=place).uid
        except ExternalPlaceSource.DoesNotExist:
            pass  # place has no Facebook source record; nothing to report

        # look up tags from Google Places
        # NOTE(review): the tags are only printed below, never saved.
        if location and \
            location.latitude is not None and location.longitude is not None:
            coords = (location.latitude,location.longitude)
            radius = 1000
        else:
            # no usable coordinates: search a wider radius around a fixed point
            coords = (40.4425,-79.9575)
            radius = 5000

        response = gplaces_client.search_request(coords,radius,keyword=row.place)

        if len(response) > 0 and 'reference' in response[0]:
            details = gplaces_client.details_request(response[0]['reference'])
            all_tags = set()
            for typ in details.get('types',[]):
                if typ in gplaces_category_map:
                    all_tags.update(gplaces_category_map[typ])
                else:
                    print 'WARNING: Unknown Google Places type: "%s"' % typ
            if len(all_tags) > 0:
                print '  Tags:',
                for t in all_tags:
                    print '%s,' % t,
                print
            gp_hits += 1
        else:
            print '  WARNING: Failure querying Google Places for "%s" within %dm of (%f,%f)' % (row.place,radius,coords[0],coords[1])
            gp_misses += 1
    print gp_hits, gp_misses