Example #1
    def verify(self, influencer):
        if not influencer.bloglovin_url:
            log.warn('No bloglovin_url')
            return []
        bloglovin_url = influencer.bloglovin_url.split()[0]
        r = requests.get(bloglovin_url)
        tree = lxml.html.fromstring(r.text)

        name_el = tree.xpath('//div[@class="blog-info"]/h1[@class="name"]')[0]
        name = name_el.text
        log.info('Blogger name from bloglovin: %r', name)

        url_el = tree.xpath('//div[@class="blog-info"]/div[@class="url"]/a')[0]
        url = url_el.text
        if not url.startswith('http'):
            url = 'http://%s' % url
        log.info('Blog url: %r', url)

        if platformutils.url_to_handle(url) == platformutils.url_to_handle(
                influencer.blog_url):
            log.info('Urls match')
            if textutils.same_word_sets(name, influencer.name):
                return ['name']
        else:
            log.warn('Urls do not match')
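
All of these examples key on platformutils.url_to_handle, whose implementation is not shown on this page. Below is a minimal sketch of the normalization it is assumed to perform (drop the scheme, a leading www. and a trailing slash, then lowercase), so the url comparisons above make sense; the real helper may differ.

import re

def url_to_handle_sketch(url):
    # Hypothetical stand-in for platformutils.url_to_handle; it only illustrates
    # the assumed behavior, not the real helper.
    handle = url.strip().lower()
    handle = re.sub(r'^https?://', '', handle)  # drop the scheme
    if handle.startswith('www.'):
        handle = handle[len('www.'):]           # drop a leading www.
    return handle.rstrip('/')                   # drop a trailing slash

# url_to_handle_sketch('http://www.Example.com/blog/') == 'example.com/blog'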
Example #2
def _redirect_to_the_other_detected(source_url, possible_target_url):
    from platformdatafetcher import platformutils

    try:
        r = requests.get(source_url, timeout=10)
    except requests.RequestException:
        return False
    return platformutils.url_to_handle(r.url) == \
           platformutils.url_to_handle(possible_target_url)
Example #3
def find_all_urls(s, exclude_imgs=True):
    """Looks also for urls without a protocol (http/https).
    """
    from platformdatafetcher import platformutils

    urls = []
    v_urls = set()

    for u in _url_re.findall(s):
        if exclude_imgs and u.endswith(_IMG_EXTS):
            continue
        handle = platformutils.url_to_handle(u)
        if handle not in v_urls:
            v_urls.add(handle)
            urls.append(u)

    for candidate in _url_no_protocol_re.findall(s):
        netloc = _netloc_for_url_candidate(candidate)
        if netloc is None:
            continue
        if exclude_imgs and candidate.endswith(_IMG_EXTS):
            continue
        # Skip texts like: posted..It or ...ok
        if '..' in candidate or '(' in candidate or '@' in candidate:
            continue
        if '.' not in netloc:
            continue
        if candidate.startswith('//'):
            continue
        root_domain = netloc.split('.')[-1]
        if not 2 <= len(root_domain) <= 4:
            continue
        if any(c.isdigit() for c in root_domain):
            continue
        # Skip texts like posted.Are - look at letter case
        if root_domain[0].isupper() and root_domain[1:].islower():
            continue

        handle = platformutils.url_to_handle(candidate)
        if handle in v_urls:
            continue
        v_urls.add(handle)
        urls.append('http://' + candidate)
    return urls
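
A hypothetical call illustrating the heuristics above; the exact matches depend on _url_re, _url_no_protocol_re and _IMG_EXTS, which are not shown here.

text = ('Read the post at https://myblog.example.com/review and follow me '
        'on instagram.com/someblogger ... see photo.png posted..It')
print(find_all_urls(text))
# Expected, assuming the regexes match as described:
# ['https://myblog.example.com/review', 'http://instagram.com/someblogger']
# photo.png is dropped by exclude_imgs, 'posted..It' by the '..' check.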
Example #4
    def check_platforms(self):
        res = self._init_res.copy()
        for eh in self.ehs_q.filter(
                field__in=models.Influencer.platform_name_to_field.values()):
            if self._val_empty(
                    eh.prev_value) and not self._val_empty(eh.curr_value):
                self._missing(eh, res)
            elif not self._val_empty(eh.prev_value) and not self._val_empty(
                    eh.curr_value):
                urls_prev = eh.prev_value.split()
                urls_curr = eh.curr_value.split()
                urls_prev = [platformutils.url_to_handle(u) for u in urls_prev]
                urls_curr = [platformutils.url_to_handle(u) for u in urls_curr]
                log.info('Urls prev: %r, Urls curr: %r', urls_prev, urls_curr)
                if set(urls_prev) != set(urls_curr):
                    self._incorrect(eh, res)
                else:
                    log.warn('Urls are the same but have different format')
            else:
                self._unknown(eh, res)
            res['processed'] += 1
        return res
Example #5
def find_matching_influencer_for_platform_url(url):
    """
    Helper method to find an influencer that has a link to the url in one of
    it's platform objects
    """
    found_infs = set()
    handle = platformutils.url_to_handle(url)

    # If the handle is just the bare domain of a social network or url
    # shortener, skip it
    if handle in [
            'facebook.com',
            'pinterest.com',
            'youtube.com',
            'instagram.com',
            'twitter.com',
            't.co',
    ]:
        log.info(('Generic social url domain found: %r, '
                  'skipping search for matching influencers.'), handle)
        return found_infs

    # TODO: when we define Platform unique fields, filter on them instead of
    # url__contains and the chunks check below
    possible_matched_platforms = Platform.objects.filter(
        url__contains=handle,
        influencer__source__isnull=False,
        influencer__blog_url__isnull=False).exclude(url_not_found=True)
    log.info('Platforms found for %r: %s', url,
             len(possible_matched_platforms))
    for platform in possible_matched_platforms:
        platform_url = platform.url
        chunks = platform_url.split(handle)
        log.info('checking: \'%s\' vs \'%s\'', handle, platform_url)
        if len(chunks) > 0 and (len(chunks[-1]) == 0
                                or not chunks[-1][0].isalnum()):
            log.info("Platforms detected for this url [%s] [%s] %r %r", handle,
                     platform_url, platform, platform.influencer)
            found_infs.add(platform.influencer)

    return found_infs
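
The split(handle) check guards against prefix matches that url__contains lets through: the character right after the handle must be missing or non-alphanumeric. A hypothetical illustration:

handle = 'twitter.com/jane'
for platform_url in ('http://twitter.com/jane',      # exact match -> accepted
                     'http://twitter.com/jane/',     # trailing slash -> accepted
                     'http://twitter.com/janedoe'):  # prefix only -> rejected
    chunks = platform_url.split(handle)
    ok = len(chunks) > 0 and (len(chunks[-1]) == 0 or not chunks[-1][0].isalnum())
    print(platform_url, ok)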
Example #6
def restore_influencers_urls(profiles_ids, to_save=False):
    """
    Here we take all new Influencers discovered via Instagram for 'singapore',
    check their platforms. And if platform occurs set its platform url to Influencer's corresponding *_url field
    :return:
    """
    from debra import admin_helpers
    print('Got %s profiles to check and correct...' % len(profiles_ids))

    profiles_with_conflicting_influencer = []

    def handle_field(inf, field_name, current_value, new_value):
        if current_value is None:
            print('Field %s is None, so restoring it to %s ... ' %
                  (field_name, new_value))
        else:
            print(
                'Field %s has a non-empty value of %s, overwriting it with %s'
                % (field_name, current_value, new_value))
        setattr(inf, field_name, new_value)

    for idx, pid in enumerate(profiles_ids):
        profile = InstagramProfile.objects.get(id=pid)
        print("===========================================")
        print("%s. Profile id %s %s" % (idx, profile.id, profile.username))
        inf = profile.discovered_influencer

        print("Influencer id %s and %s" % (inf.id, inf))
        print("Getting platforms... ")
        platforms = Platform.objects.filter(
            influencer=inf,
            autovalidated=True,
            platform_name__in=Platform.SOCIAL_PLATFORMS_CRAWLED).exclude(
                url_not_found=True).order_by("platform_name")

        print('This influencer has %s social crawled platforms: %s' %
              (platforms.count(), [pl.platform_name for pl in platforms]))

        platform_names = [pl.platform_name for pl in platforms]
        if 'Instagram' not in platform_names:
            current_value = getattr(inf, 'insta_url')
            handle_field(inf, 'insta_url', current_value,
                         'http://instagram.com/' + profile.username)
        conflict_found = False
        for pl in platforms:
            field_name = Influencer.platform_name_to_field.get(
                pl.platform_name)
            if field_name is not None:
                current_value = getattr(inf, field_name)
                handle_field(inf, field_name, current_value, pl.url)
                # Check for a conflict: the influencer this profile is connected
                # to already has a different, validated instagram url, so these
                # need a closer look.
                if field_name == 'insta_url' and current_value:
                    u1 = platformutils.url_to_handle(current_value.lower())
                    u2 = platformutils.url_to_handle(pl.url.lower())
                    if u1 != u2:
                        profiles_with_conflicting_influencer.append(pid)
                        conflict_found = True
            else:
                print(
                    'Platform %s does not have a separate url field, skipping it.'
                    % pl.platform_name)

        if to_save and not conflict_found:
            print("Saving now")
            inf.save()
            admin_helpers.handle_social_handle_updates(inf, 'fb_url',
                                                       inf.fb_url)
            admin_helpers.handle_social_handle_updates(inf, 'pin_url',
                                                       inf.pin_url)
            admin_helpers.handle_social_handle_updates(inf, 'tw_url',
                                                       inf.tw_url)
            admin_helpers.handle_social_handle_updates(inf, 'insta_url',
                                                       inf.insta_url)
            admin_helpers.handle_social_handle_updates(inf, 'youtube_url',
                                                       inf.youtube_url)

        if to_save and conflict_found:
            profile.discovered_influencer = None
            profile.save()

    return profiles_with_conflicting_influencer
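
A hypothetical dry run; with the default to_save=False nothing is written, the function only prints what it would change and returns the profile ids whose influencer already has a different validated insta_url.

profile_ids = [101, 102, 103]  # hypothetical InstagramProfile ids
conflicting = restore_influencers_urls(profile_ids, to_save=False)
print('Profiles needing manual review: %r' % conflicting)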
Example #7
def is_blacklisted(url):
    if '#!' in url:
        return False
    return platformutils.url_to_handle(url) in BLACKLISTED_MAINSTREAM_HANDLES
Example #8
import logging

from debra import models
from debra import constants
from debra import db_util
from django.db.models import Q
from hanna import import_from_blog_post

from platformdatafetcher import platformutils
from xpathscraper import utils
from xpathscraper import textutils


log = logging.getLogger('platformdatafetcher.invariants')


BLACKLISTED_MAINSTREAM_HANDLES = {platformutils.url_to_handle(u)
                                  for u in import_from_blog_post.exclude_domains}


def append_social_urls_to_blacklist_handles():
    """
    Here, we extend the list of bad urls by creating fake urls, such as "twitter.com/pinterest". This is an invalid
    url and should be caught. So, here, we create such fake urls for each platform that we crawl.
    """
    social_platform_names = models.Platform.SOCIAL_PLATFORMS_CRAWLED
    new_blacklist = []
    for s1 in social_platform_names:
        u = s1.lower() + '.com'
        for s2 in social_platform_names:
            if s1 == s2:
                continue
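
The snippet ends mid-loop. Here is a self-contained sketch of what the rest presumably does, assuming the goal stated in the docstring (fake '<platform>.com/<other platform>' handles added to the blacklist); the original tail is not shown.

def append_social_urls_to_blacklist_handles_sketch(social_platform_names):
    # Assumed behavior based on the docstring, not the original code.
    new_blacklist = []
    for s1 in social_platform_names:
        u = s1.lower() + '.com'
        for s2 in social_platform_names:
            if s1 == s2:
                continue
            new_blacklist.append(u + '/' + s2.lower())  # e.g. 'twitter.com/pinterest'
    BLACKLISTED_MAINSTREAM_HANDLES.update(new_blacklist)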
Example #9
def _do_import_from_content(content,
                            opr,
                            to_save,
                            blacklisted_domains=BLACKLISTED_DOMAINS):
    """
    This function creates new platforms from content provided by searching for urls
    (except those given in blacklisted_domains).

    Limitation: it works only for building new 'blog' platforms, and doesn't work for creating new social platforms
    """
    if not content:
        log.warn('No content, doing nothing')
        return
    urls = contentfiltering.find_all_urls(content)
    log.info('Found %d urls: %r', len(urls), urls)
    platforms = []
    for url in urls:
        log.info('Orig url: %r', url)
        try:
            url = utils.resolve_http_redirect(url)
        except Exception:
            log.exception('While resolve_http_redirect, skipping')
            continue
        log.info('Redirected url: %r', url)
        vurl = platformutils.url_to_handle(url)
        if not vurl:
            log.info('No handle computed from url %r, skipping', url)
            continue
        domain = utils.domain_from_url(vurl)
        if domain in blacklisted_domains:
            log.info('Domain %r is blacklisted', domain)
            continue
        blog_url = utils.url_without_path(url)
        if domain.endswith('.wordpress.com'):
            platforms.append(
                models.Platform(platform_name='Wordpress', url=blog_url))
        elif domain.endswith('.blogspot.com'):
            platforms.append(
                models.Platform(platform_name='Blogspot', url=blog_url))
        else:
            content = xutils.fetch_url(blog_url)
            if content:
                discovered_pname = xutils.contains_blog_metatags(content)
                if discovered_pname:
                    platforms.append(
                        models.Platform(platform_name=discovered_pname,
                                        url=blog_url))
                    continue
            platforms.append(
                models.Platform(platform_name='Custom', url=blog_url))

    influencers = []
    influencers_created = []
    for plat in platforms:
        inf, inf_created = helpers.get_or_create_influencer(
            plat.url, 'comments_content_import', to_save)
        if not inf:
            log.warn(
                'Skipping url %r because influencer with this url is blacklisted',
                plat.url)
            continue
        plat.influencer = inf
        influencers.append(inf)
        if inf_created:
            influencers_created.append(inf)

    if opr:
        opr.data = {
            'influencer_ids': [influencer.id for influencer in influencers],
            'influencer_created_ids':
            [influencer.id for influencer in influencers_created],
            'influencer_blog_urls':
            [influencer.blog_url for influencer in influencers],
        }

    log.info('Platforms from content: %r', platforms)
    if to_save:
        for plat in platforms:
            # influencer of None means we got a blacklisted influencer
            # when we searched by URL.
            if plat.influencer is not None:
                plat.save()

    return platforms
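
A hypothetical invocation on a comment body (opr=None, dry run with to_save=False); assuming the url resolves and is not blacklisted, the wordpress.com subdomain becomes a 'Wordpress' platform.

comment = 'Great post! I wrote something similar at myblog.wordpress.com/2014/05/my-review'
platforms = _do_import_from_content(comment, opr=None, to_save=False)
# Expected under those assumptions: one models.Platform with
# platform_name='Wordpress' and url='http://myblog.wordpress.com'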
Example #10
    def fetch_posts(self,
                    max_pages=5,
                    pis_max_pages=5,
                    include_only_post_urls=None,
                    force_fetch_more=False):
        """
        @param force_fetch_more if set to True, the caller is responsible for the termination and setting appropriate
               limit on max_pages
        """
        self._assure_valid_platform_url()

        # Setting platform's last_fetched date
        if self.platform is not None:
            self.platform.last_fetched = datetime.datetime.now()
            self.platform.save()

        res = []
        if include_only_post_urls is not None:
            # Normalize urls
            include_only_post_urls = {
                platformutils.url_to_handle(u)
                for u in include_only_post_urls
            }
        self.platform.inc_api_calls()

        stop_processing = False

        for page_no, entries in enumerate(fetch_entries(self.platform)):
            entries_skipped = 0
            if self.test_run:
                entries = entries[:2]

            # Flag raised from inner loop - stop fetching new entries
            if stop_processing:
                break

            for e in entries:
                if not self.policy.should_continue_fetching(
                        self) and not force_fetch_more:
                    stop_processing = True
                    break

                # date can be present in multiple places
                if not hasattr(e, 'published_parsed') and not hasattr(
                        e, 'updated_parsed'):
                    log.error('No date in feed entry %r', e)
                    continue
                if include_only_post_urls is not None and \
                        platformutils.url_to_handle(e['link']) not in include_only_post_urls:
                    log.info('Post url %r not in included urls', e['link'])
                    continue

                post_url = e['link']
                previously_saved = list(
                    models.Posts.objects.filter(url=post_url,
                                                platform=self.platform))
                if previously_saved:
                    if self.should_update_old_posts():
                        log.debug('Updating existing post for url {}'.format(
                            post_url))
                        post = previously_saved[0]
                    else:
                        self._inc('posts_skipped')
                        entries_skipped += 1
                        log.debug(
                            'Skipping already saved post with url {}'.format(
                                post_url))
                        if not self.test_run:
                            continue
                else:
                    log.debug('Creating new post for url {}'.format(post_url))
                    post = models.Posts()

                post.influencer = self.platform.influencer
                post.show_on_search = self.platform.influencer.show_on_search
                post.platform = self.platform
                post.platform_name = self.platform.platform_name
                post.title = e['title']
                post.url = e['link']
                post.content = self._content_from_entry(e)
                time_struct = e.published_parsed if hasattr(
                    e, 'published_parsed') else e.updated_parsed
                post.create_date = utils.from_struct_to_dt(time_struct)
                ## Look for the comment count in the entry so we have it even
                ## when the feed limits how many comments it shows.
                ## TODO: we may need to expand the filter to include more types
                ## and check more blogs.
                num_comments = e.get('slash_comments') or e.get('slash:comments')
                if num_comments is not None:
                    log.debug('Found number of comments: %s', num_comments)
                    post.ext_num_comments = int(num_comments)

                api_data = {}
                for k in ('id', 'wfw_commentrss', 'commentrss', 'commentsrss'):
                    if e.get(k):
                        api_data[k] = e[k]
                post.api_id = json.dumps(api_data)

                self.save_post(post)
                res.append(post)

                pis = self.fetch_post_interactions_extra(
                    [post], max_pages=pis_max_pages)
                if self.test_run:
                    res += pis

            if not self.test_run and entries_skipped == len(entries):
                log.debug(
                    'All entries skipped, not fetching more (total entries: %s)'
                    % len(entries))
                break
            if max_pages is not None and page_no >= max_pages:
                log.debug('Max pages reached')
                break

        return res
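
A hypothetical call restricting the fetch to two known post urls; since include_only_post_urls is normalized with url_to_handle, scheme and trailing-slash differences between the list and the feed entries do not matter. FeedFetcher and platform are stand-ins for whatever fetcher subclass and Platform object are used in practice.

fetcher = FeedFetcher(platform)  # hypothetical fetcher subclass and platform object
posts = fetcher.fetch_posts(
    max_pages=2,
    include_only_post_urls=[
        'http://myblog.example.com/2015/01/hello-world/',
        'http://myblog.example.com/2015/02/second-post/',
    ])
print('Fetched %d posts' % len(posts))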