def verify(self, influencer):
    if not influencer.bloglovin_url:
        log.warn('No bloglovin_url')
        return []
    bloglovin_url = influencer.bloglovin_url.split()[0]
    r = requests.get(bloglovin_url)
    tree = lxml.html.fromstring(r.text)
    name_el = tree.xpath('//div[@class="blog-info"]/h1[@class="name"]')[0]
    name = name_el.text
    log.info('Blogger name from bloglovin: %r', name)
    url_el = tree.xpath('//div[@class="blog-info"]/div[@class="url"]/a')[0]
    url = url_el.text
    if not url.startswith('http'):
        url = 'http://%s' % url
    log.info('Blog url: %r', url)
    if platformutils.url_to_handle(url) == \
            platformutils.url_to_handle(influencer.blog_url):
        log.info('Urls match')
        if textutils.same_word_sets(name, influencer.name):
            return ['name']
    else:
        log.warn('Urls do not match')
    # Urls matched but names did not, or urls did not match: nothing verified
    return []
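
# Hedged illustration (not part of the original flow): the verification above
# hinges on handle-normalized url comparison via platformutils.url_to_handle,
# so scheme/www/trailing-slash variants of the same blog should compare equal.
# The urls below are hypothetical.
def _example_handle_match():
    a = platformutils.url_to_handle('http://www.myblog.example.com/')
    b = platformutils.url_to_handle('https://myblog.example.com')
    return a == b  # expected True for a typical handle normalization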
def _redirect_to_the_other_detected(source_url, possible_target_url):
    from platformdatafetcher import platformutils
    try:
        r = requests.get(source_url, timeout=10)
    except requests.RequestException:
        # Treat network errors as "no redirect detected"
        return False
    return platformutils.url_to_handle(r.url) == \
        platformutils.url_to_handle(possible_target_url)
def find_all_urls(s, exclude_imgs=True):
    """Looks also for urls without a protocol (http/https)."""
    from platformdatafetcher import platformutils

    urls = []
    v_urls = set()
    for u in _url_re.findall(s):
        if exclude_imgs and u.endswith(_IMG_EXTS):
            continue
        handle = platformutils.url_to_handle(u)
        if handle not in v_urls:
            v_urls.add(handle)
            urls.append(u)
    for candidate in _url_no_protocol_re.findall(s):
        netloc = _netloc_for_url_candidate(candidate)
        if netloc is None:
            continue
        if exclude_imgs and candidate.endswith(_IMG_EXTS):
            continue
        # Skip texts like: posted..It or ...ok
        if '..' in candidate or '(' in candidate or '@' in candidate:
            continue
        if '.' not in netloc:
            continue
        if candidate.startswith('//'):
            continue
        root_domain = netloc.split('.')[-1]
        if not 2 <= len(root_domain) <= 4:
            continue
        if any(c.isdigit() for c in root_domain):
            continue
        # Skip texts like posted.Are - look at letter case
        if root_domain[0].isupper() and root_domain[1:].islower():
            continue
        handle = platformutils.url_to_handle(candidate)
        if handle in v_urls:
            continue
        v_urls.add(handle)
        urls.append('http://' + candidate)
    return urls
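
# Hedged usage sketch for find_all_urls (assumes the module-level _url_re and
# _url_no_protocol_re regexes behave as the heuristics above suggest; the
# sample text and urls are hypothetical):
def _example_find_all_urls():
    text = 'See http://myblog.example.com/post and also stylediary.net. posted.Are you?'
    # Expected under those assumptions: the explicit url kept as-is, the bare
    # domain returned with an http:// prefix, and "posted.Are" rejected by
    # the letter-case heuristic.
    return find_all_urls(text)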
def check_platforms(self):
    res = self._init_res.copy()
    for eh in self.ehs_q.filter(
            field__in=models.Influencer.platform_name_to_field.values()):
        if self._val_empty(eh.prev_value) and not self._val_empty(eh.curr_value):
            self._missing(eh, res)
        elif not self._val_empty(eh.prev_value) and \
                not self._val_empty(eh.curr_value):
            urls_prev = [platformutils.url_to_handle(u)
                         for u in eh.prev_value.split()]
            urls_curr = [platformutils.url_to_handle(u)
                         for u in eh.curr_value.split()]
            log.info('Urls prev: %r, Urls curr: %r', urls_prev, urls_curr)
            if set(urls_prev) != set(urls_curr):
                self._incorrect(eh, res)
            else:
                log.warn('Urls are the same but have different format')
        else:
            self._unknown(eh, res)
        res['processed'] += 1
    return res
def find_matching_influencer_for_platform_url(url):
    """Helper method to find an influencer that has a link to the url in one
    of its platform objects.
    """
    found_infs = set()
    handle = platformutils.url_to_handle(url)
    # If the handle is a bare social-network domain or a url shortener,
    # skip it - it cannot identify a single influencer
    if handle in [
            'facebook.com',
            'pinterest.com',
            'youtube.com',
            'instagram.com',
            'twitter.com',
            't.co',
    ]:
        log.info(('Generic social url domain found: %r, '
                  'skipping search for matching influencers.'), handle)
        return found_infs
    # TODO: when we define Platform unique fields, make filtering use them
    # instead of url__contains and the chunks check below
    possible_matched_platforms = Platform.objects.filter(
        url__contains=handle,
        influencer__source__isnull=False,
        influencer__blog_url__isnull=False).exclude(url_not_found=True)
    log.info('Platforms found for %r: %s', url, len(possible_matched_platforms))
    for platform in possible_matched_platforms:
        platform_url = platform.url
        chunks = platform_url.split(handle)
        log.info("checking: '%s' vs '%s'", handle, platform_url)
        # Accept only if the handle ends the url or is followed by a
        # non-alphanumeric character (guards against prefix false positives)
        if len(chunks) > 0 and (len(chunks[-1]) == 0
                                or not chunks[-1][0].isalnum()):
            log.info("Platforms detected for this url [%s] [%s] %r %r",
                     handle, platform_url, platform, platform.influencer)
            found_infs.add(platform.influencer)
    return found_infs
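
# Hedged illustration of the suffix-boundary check in
# find_matching_influencer_for_platform_url (hypothetical urls): a platform
# url that merely extends the handle with more alphanumerics must not match.
def _example_handle_boundary():
    handle = 'twitter.com/alice'
    cases = [
        ('http://twitter.com/alice', True),        # exact match: empty tail
        ('http://twitter.com/alice/', True),       # tail starts with '/'
        ('http://twitter.com/alicesmith', False),  # tail starts alphanumeric
    ]
    for platform_url, expected in cases:
        chunks = platform_url.split(handle)
        matched = len(chunks) > 0 and (
            len(chunks[-1]) == 0 or not chunks[-1][0].isalnum())
        assert matched == expected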
def restore_influencers_urls(profiles_ids, to_save=False):
    """Take all new Influencers discovered via Instagram for 'singapore' and
    check their platforms. For each platform found, set the platform's url on
    the Influencer's corresponding *_url field.

    :return: ids of profiles whose influencer has a conflicting insta_url
    """
    from debra import admin_helpers

    print('Got %s profiles to check and correct...' % len(profiles_ids))
    profiles_with_conflicting_influencer = []

    def handle_field(inf, field_name, current_value, new_value):
        if current_value is None:
            print('Field %s is None, so restoring it to %s ...'
                  % (field_name, new_value))
        else:
            print('Field %s has a non-empty value of %s, overwriting it with %s'
                  % (field_name, current_value, new_value))
        setattr(inf, field_name, new_value)

    for idx, pid in enumerate(profiles_ids):
        profile = InstagramProfile.objects.get(id=pid)
        print("===========================================")
        print("%s. Profile id %s %s" % (idx, profile.id, profile.username))
        inf = profile.discovered_influencer
        print("Influencer id %s and %s" % (inf.id, inf))
        print("Getting platforms... ")
        platforms = Platform.objects.filter(
            influencer=inf,
            autovalidated=True,
            platform_name__in=Platform.SOCIAL_PLATFORMS_CRAWLED).exclude(
                url_not_found=True).order_by("platform_name")
        print('This influencer has %s social crawled platforms: %s'
              % (platforms.count(), [pl.platform_name for pl in platforms]))
        platform_names = [pl.platform_name for pl in platforms]
        if 'Instagram' not in platform_names:
            current_value = getattr(inf, 'insta_url')
            handle_field(inf, 'insta_url', current_value,
                         'http://instagram.com/' + profile.username)
        conflict_found = False
        for pl in platforms:
            field_name = Influencer.platform_name_to_field.get(pl.platform_name)
            if field_name is not None:
                current_value = getattr(inf, field_name)
                handle_field(inf, field_name, current_value, pl.url)
                # Check for a conflict: the influencer this profile was
                # connected to already has a different, also validated,
                # instagram url. Such profiles need a closer look.
                if field_name == 'insta_url' and current_value:
                    u1 = platformutils.url_to_handle(current_value.lower())
                    u2 = platformutils.url_to_handle(pl.url.lower())
                    if u1 != u2:
                        profiles_with_conflicting_influencer.append(pid)
                        conflict_found = True
            else:
                print('Platform %s does not have a separate url field, '
                      'skipping it.' % pl.platform_name)
        if to_save and not conflict_found:
            print("Saving now")
            inf.save()
            admin_helpers.handle_social_handle_updates(inf, 'fb_url', inf.fb_url)
            admin_helpers.handle_social_handle_updates(inf, 'pin_url', inf.pin_url)
            admin_helpers.handle_social_handle_updates(inf, 'tw_url', inf.tw_url)
            admin_helpers.handle_social_handle_updates(inf, 'insta_url', inf.insta_url)
            admin_helpers.handle_social_handle_updates(inf, 'youtube_url', inf.youtube_url)
        if to_save and conflict_found:
            profile.discovered_influencer = None
            profile.save()
    return profiles_with_conflicting_influencer
def is_blacklisted(url):
    # Hashbang urls are never treated as blacklisted
    if '#!' in url:
        return False
    return platformutils.url_to_handle(url) in BLACKLISTED_MAINSTREAM_HANDLES
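
# Hedged usage sketch (hypothetical urls): the '#!' exemption fires before the
# handle lookup, so even a url on a blacklisted domain passes if it contains a
# hashbang fragment.
def _example_is_blacklisted():
    return (is_blacklisted('http://example.com/#!/profile'),  # always False
            is_blacklisted('http://example.com/'))            # depends on blacklist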
import logging

from debra import models
from debra import constants
from debra import db_util
from django.db.models import Q
from hanna import import_from_blog_post
from platformdatafetcher import platformutils
from xpathscraper import utils
from xpathscraper import textutils

log = logging.getLogger('platformdatafetcher.invariants')

BLACKLISTED_MAINSTREAM_HANDLES = {
    platformutils.url_to_handle(u)
    for u in import_from_blog_post.exclude_domains
}


def append_social_urls_to_blacklist_handles():
    """Extend the list of bad urls by creating fake urls, such as
    "twitter.com/pinterest". This is an invalid url and should be caught.
    So, here, we create such fake urls for each platform that we crawl.
    """
    social_platform_names = models.Platform.SOCIAL_PLATFORMS_CRAWLED
    new_blacklist = []
    for s1 in social_platform_names:
        u = s1.lower() + '.com'
        for s2 in social_platform_names:
            if s1 == s2:
                continue
            # The original loop body was truncated here; the lines below are a
            # reconstruction based on the docstring: register fake urls such
            # as "twitter.com/pinterest" in the blacklist set.
            new_blacklist.append(
                platformutils.url_to_handle(u + '/' + s2.lower()))
    BLACKLISTED_MAINSTREAM_HANDLES.update(new_blacklist)
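
# Hedged sketch of the fake urls the function above generates, assuming
# SOCIAL_PLATFORMS_CRAWLED contains names like 'Twitter' and 'Pinterest':
# cross-product urls such as 'twitter.com/pinterest' and 'pinterest.com/twitter'.
def _example_fake_social_urls():
    names = ['Twitter', 'Pinterest']
    return [n1.lower() + '.com/' + n2.lower()
            for n1 in names for n2 in names if n1 != n2]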
def _do_import_from_content(content, opr, to_save,
                            blacklisted_domains=BLACKLISTED_DOMAINS):
    """Create new platforms from the provided content by searching for urls
    (except those in blacklisted_domains).

    Limitation: it works only for building new 'blog' platforms; it doesn't
    work for creating new social platforms.
    """
    if not content:
        log.warn('No content, doing nothing')
        return
    urls = contentfiltering.find_all_urls(content)
    log.info('Found %d urls: %r', len(urls), urls)
    platforms = []
    for url in urls:
        log.info('Orig url: %r', url)
        try:
            url = utils.resolve_http_redirect(url)
        except Exception:
            log.exception('While resolve_http_redirect, skipping')
            continue
        log.info('Redirected url: %r', url)
        vurl = platformutils.url_to_handle(url)
        if not vurl:
            log.info('No handle computed from url %r, skipping', url)
            continue
        domain = utils.domain_from_url(vurl)
        if domain in blacklisted_domains:
            log.info('Domain %r is blacklisted', domain)
            continue
        blog_url = utils.url_without_path(url)
        if domain.endswith('.wordpress.com'):
            platforms.append(
                models.Platform(platform_name='Wordpress', url=blog_url))
        elif domain.endswith('.blogspot.com'):
            platforms.append(
                models.Platform(platform_name='Blogspot', url=blog_url))
        else:
            # Use a separate name to avoid shadowing the `content` argument
            page_content = xutils.fetch_url(blog_url)
            if page_content:
                discovered_pname = xutils.contains_blog_metatags(page_content)
                if discovered_pname:
                    platforms.append(
                        models.Platform(platform_name=discovered_pname,
                                        url=blog_url))
                    continue
            platforms.append(
                models.Platform(platform_name='Custom', url=blog_url))
    influencers = []
    influencers_created = []
    for plat in platforms:
        inf, inf_created = helpers.get_or_create_influencer(
            plat.url, 'comments_content_import', to_save)
        if not inf:
            log.warn('Skipping url %r because influencer with this url '
                     'is blacklisted', plat.url)
            continue
        plat.influencer = inf
        influencers.append(inf)
        if inf_created:
            influencers_created.append(inf)
    if opr:
        opr.data = {
            'influencer_ids':
                [influencer.id for influencer in influencers],
            'influencer_created_ids':
                [influencer.id for influencer in influencers_created],
            'influencer_blog_urls':
                [influencer.blog_url for influencer in influencers],
        }
    log.info('Platforms from content: %r', platforms)
    if to_save:
        for plat in platforms:
            # influencer of None means we got a blacklisted influencer
            # when we searched by URL.
            if plat.influencer is not None:
                plat.save()
    return platforms
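
# Illustration of the domain-based platform classification used above
# (hypothetical helper, not part of the original module): hosted-blog
# subdomains map directly to a platform name, everything else falls back to
# 'Custom' unless blog metatags say otherwise.
def _example_classify_blog_domain(domain):
    if domain.endswith('.wordpress.com'):
        return 'Wordpress'
    if domain.endswith('.blogspot.com'):
        return 'Blogspot'
    return 'Custom'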
def fetch_posts(self, max_pages=5, pis_max_pages=5,
                include_only_post_urls=None, force_fetch_more=False):
    """
    @param force_fetch_more if set to True, the caller is responsible for the
        termination and for setting an appropriate limit on max_pages
    """
    self._assure_valid_platform_url()

    # Set the platform's last_fetched date
    if self.platform is not None:
        self.platform.last_fetched = datetime.datetime.now()
        self.platform.save()

    res = []
    if include_only_post_urls is not None:
        # Normalize urls
        include_only_post_urls = {
            platformutils.url_to_handle(u) for u in include_only_post_urls
        }
    self.platform.inc_api_calls()
    stop_processing = False
    for page_no, entries in enumerate(fetch_entries(self.platform)):
        entries_skipped = 0
        if self.test_run:
            entries = entries[:2]
        # Flag raised from inner loop - stop fetching new entries
        if stop_processing:
            break
        for e in entries:
            if not self.policy.should_continue_fetching(self) \
                    and not force_fetch_more:
                stop_processing = True
                break
            # date can be present in multiple places
            if not hasattr(e, 'published_parsed') and \
                    not hasattr(e, 'updated_parsed'):
                log.error('No date in feed entry %r', e)
                continue
            if include_only_post_urls is not None and \
                    platformutils.url_to_handle(e['link']) not in include_only_post_urls:
                log.info('Post url %r not in included urls', e['link'])
                continue
            post_url = e['link']
            previously_saved = list(models.Posts.objects.filter(
                url=post_url, platform=self.platform))
            if previously_saved:
                if self.should_update_old_posts():
                    log.debug('Updating existing post for url {}'.format(post_url))
                    post = previously_saved[0]
                else:
                    self._inc('posts_skipped')
                    entries_skipped += 1
                    log.debug('Skipping already saved post with url {}'.format(post_url))
                    if not self.test_run:
                        continue
            else:
                log.debug('Creating new post for url {}'.format(post_url))
                post = models.Posts()
            post.influencer = self.platform.influencer
            post.show_on_search = self.platform.influencer.show_on_search
            post.platform = self.platform
            post.platform_name = self.platform.platform_name
            post.title = e['title']
            post.url = e['link']
            post.content = self._content_from_entry(e)
            time_struct = e.published_parsed \
                if hasattr(e, 'published_parsed') else e.updated_parsed
            post.create_date = utils.from_struct_to_dt(time_struct)
            # Look for the comment number in the body so that we can get this
            # information even if the feed limits the comments shown in it.
            # TODO: we may need to expand the filter to include more types.
            # Might need to check more blogs.
            if 'slash_comments' in e or 'slash:comments' in e:
                # Read whichever key is present (the original indexed only
                # 'slash_comments', which raised KeyError for the other form)
                num_comments = e.get('slash_comments') or e.get('slash:comments')
                log.debug('Found number of comments: %s', num_comments)
                post.ext_num_comments = int(num_comments)
            api_data = {}
            for k in ('id', 'wfw_commentrss', 'commentrss', 'commentsrss'):
                if e.get(k):
                    api_data[k] = e[k]
            post.api_id = json.dumps(api_data)
            self.save_post(post)
            res.append(post)
            pis = self.fetch_post_interactions_extra(
                [post], max_pages=pis_max_pages)
            if self.test_run:
                res += pis
        if not self.test_run and entries_skipped == len(entries):
            log.debug('All entries skipped, not fetching more '
                      '(total entries: %s)' % len(entries))
            break
        if max_pages is not None and page_no >= max_pages:
            log.debug('Max pages reached')
            break
    return res
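
# Hedged illustration of the include_only_post_urls normalization above
# (hypothetical urls): comparing handles instead of raw strings makes the
# membership test tolerant to scheme/www/trailing-slash differences, under a
# typical url_to_handle normalization.
def _example_post_url_filter():
    include = {platformutils.url_to_handle(u)
               for u in ['http://myblog.example.com/post-1']}
    entry_link = 'https://www.myblog.example.com/post-1/'
    return platformutils.url_to_handle(entry_link) in include  # expected True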