def petester_single(url):
    rows = [row for row in blogger_outreach_data()
            if utils.domain_from_url(row['url']) == utils.domain_from_url(url)]
    print rows
    assert rows
    pet = PETester(rows)
    pet.test()

def update_influencers_email(self, to_save=False, max_visited_links=20):
    log.info('Initial email field value for %r: %r',
             self.source_platform.influencer,
             self.source_platform.influencer.email)
    self.xbrowser.load_url(self.source_platform.url)
    urls = self.xbrowser.execute_jsfun_safe(
        [], '_XPS.visibleLinksWithTexts',
        ['contact', 'about', 'social', 'media', 'follow'], 40)
    urls = [u for u in urls
            if utils.domain_from_url(u) ==
            utils.domain_from_url(self.xbrowser.driver.current_url)]
    urls = [u for u in urls if urlparse.urlsplit(u).path.rstrip('/')]
    urls = utils.unique_sameorder(urls)
    log.info('Urls to visit in search for emails: %r', urls)
    for page_url in urls[:max_visited_links]:
        try:
            self.xbrowser.load_url(page_url)
            updated = self._update_from_current_page(to_save)
            if updated:
                log.info('Current page contained email')
        except:
            log.exception('While processing %r, skipping', page_url)
    log.info('Final email field value for %r: %r',
             self.source_platform.influencer,
             self.source_platform.influencer.email)

def _find_platform(blog_url):
    blog_domain = utils.domain_from_url(blog_url)
    pl_candidates = models.Platform.objects.filter(url__contains=blog_domain)
    for pl in pl_candidates:
        if utils.domain_from_url(pl.url) == blog_domain:
            return pl
    return None

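# Note on _find_platform: url__contains only narrows candidates in SQL;
# substring matching can produce false positives (e.g. 'foo.com' is contained
# in 'notfoo.com'), so the loop re-checks each candidate with an exact
# domain comparison.
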
def get_about_page_links(xbrowser):
    links = xbrowser.execute_jsfun_safe(
        [], '_XPS.visibleLinksWithTexts',
        ['contact', 'about', 'social', 'media', 'follow'], 40)
    links = [l for l in links
             if utils.domain_from_url(l) ==
             utils.domain_from_url(xbrowser.driver.current_url)]
    return utils.unique_sameorder(links)

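# Usage sketch for get_about_page_links (illustrative; assumes an
# xbrowser.XBrowser session like the one used in extract_product_urls below):
#
#   with xbrowser.XBrowser(headless_display=True) as xb:
#       xb.load_url('http://example-blog.com')
#       print get_about_page_links(xb)  # same-domain contact/about/social links
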
def submit_blog_task_by_url(url):
    blogs = _read_blogs()
    matching = [b for b in blogs
                if utils.domain_from_url(url) == utils.domain_from_url(b['blog_url'])]
    log.info('Found matching blogs: %r', matching)
    if not matching:
        return
    handle_blog(matching[0])

def extract_links(self, to_save=False):
    posts_data = list(self.platform.posts_set.all()
                      .order_by('-create_date')
                      .values('url')[:COMMON_LINKS_POSTS])
    if not posts_data:
        log.warn('No posts for common links search')
        return []
    posts_urls = [d['url'] for d in posts_data]
    log.info('posts_urls: %r', posts_urls)

    # Maps link kind to a dict mapping a post url to a set of (url, text) pairs
    by_kind = defaultdict(dict)
    for url in posts_urls:
        log.info('Fetching content from %r', url)
        by_kind['common_external'][url] = set()
        by_kind['common_internal'][url] = set()
        html_it = iter(utils.fetch_iframes(url))
        while True:
            try:
                html = html_it.next()
            except StopIteration:
                break
            except:
                log.exception('While fetching html, skipping this url')
                continue
            links_texts = contentfiltering.find_links_with_texts(html)
            links_texts = [(u, t) for (u, t) in links_texts
                           if not u.endswith(UNWANTED_EXTS)]
            links_texts = [(u, t) for (u, t) in links_texts
                           if not any(ss in u for ss in BLACKLISTED_URL_SUBSTRINGS)]
            by_kind['common_external'][url].update(
                [(u, t) for (u, t) in links_texts
                 if utils.domain_from_url(platformutils.url_to_handle(u)) != self.source_handle])
            by_kind['common_internal'][url].update(
                [(u, t) for (u, t) in links_texts
                 if utils.domain_from_url(platformutils.url_to_handle(u)) == self.source_handle])

    common = defaultdict(dict)
    for kind, links_texts_by_url in by_kind.items():
        nonempty_sets = [s for s in links_texts_by_url.values() if s]
        if len(nonempty_sets) < 2:
            log.warn('Not enough nonempty sets of links from posts for %s', kind)
            common[kind] = set()
            continue
        common[kind] = sorted(set.intersection(*nonempty_sets))
        common[kind] = filter_links_texts(common[kind])
        log.info('Common links of kind %r (%d):\n%s',
                 kind, len(common[kind]), pformat(common[kind]))

    res = []
    for kind, common_links_texts in common.items():
        res += save_links(self.platform, kind, common_links_texts, to_save)
    return res

def extract_links(self, to_save=False):
    clusters = xutils.find_navigation_links_clusters(self.xbrowser)
    # flatten all clusters
    els = [el for cluster in clusters for el in cluster]
    links_texts = [(el.get_attribute('href'), el.text)
                   for el in els if el.get_attribute('href')]
    log.debug('links_texts: %r', links_texts)
    links_texts = utils.unique_sameorder(links_texts, key=lambda lt: lt[0])
    # Keep same-domain links that point to an actual path, not the front page
    links_texts = [(link, text) for (link, text) in links_texts
                   if utils.domain_from_url(link) ==
                   utils.domain_from_url(self.xbrowser.driver.current_url)
                   and utils.url_contains_path(link)]
    return save_links(self.platform, 'navigation', links_texts, to_save)

def filter_urls(urls, exclude_domains_from_urls):
    domains = set()
    for eurl in exclude_domains_from_urls:
        if not eurl.startswith('http'):
            eurl = 'http://%s' % eurl
        domains.add(utils.domain_from_url(eurl))
        domains.add('www.%s' % utils.domain_from_url(eurl))
    res = []
    for url in urls:
        if utils.domain_from_url(url) in domains:
            continue
        res.append(url)
    return res

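# Usage sketch for filter_urls (illustrative; the expected result assumes
# utils.domain_from_url maps both 'example.com' and 'http://example.com/a'
# to the same domain string):
#
#   filter_urls(['http://example.com/a', 'http://keep.me/b'], ['example.com'])
#   # -> ['http://keep.me/b']
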
def search(self, query, pages):
    input_el = self.xb.driver.find_element_by_xpath('//input[@type="text"]')
    input_el.send_keys(query)
    time.sleep(1)
    self._find_search_button().click()
    time.sleep(5)
    #self._ensure_more_results()
    self.block_if_captcha()
    for page_no in xrange(pages):
        current_domains = [utils.domain_from_url(u)
                           for u in self._current_results()]
        # Results sometimes carry extra text after the domain; keep the first token
        current_domains = [cd.split(' ', 1)[0] for cd in current_domains]
        current_urls = ['http://%s' % u for u in current_domains]
        log.info('Current google results: %s', current_urls)
        yield current_urls
        self._sleep_before_clicking_next()
        next_el = self.xb.driver.find_element_by_id('pnnext')
        next_el.click()
        time.sleep(5)
        #self._ensure_more_results()
        self.block_if_captcha()

def mark_brand_signup(**kw):
    try:
        streak = Streak()
        pipeline = streak.get_pipeline_by_name('2016')
        stage = pipeline.get_stage_by_name('TESTING' if settings.DEBUG else 'New Leads')
        box = stage.create_box(utils.domain_from_url(kw.get('brand_signedup_url')))
        box.update_fields({
            'Brand Name': kw.get('brand_signedup_brand_name'),
            'Brand URL': kw.get('brand_signedup_url'),
            'Created': int(time.mktime(datetime.datetime.now().timetuple()) * 1000),
            'Email': kw.get('brand_signedup_email'),
            'Person': '{} {}'.format(kw.get('brand_signedup_first_name'),
                                     kw.get('brand_signedup_last_name')),
            'Marketing Signup Page': kw.get('referer_tag'),
        })
    except:
        # Best-effort CRM bookkeeping; a failure here must not break the signup flow
        pass

def _has_mostly_valid_products(self, url):
    domain = utils.domain_from_url(url)
    brand_q = models.Brands.objects.filter(domain_name=domain)
    if not brand_q.exists():
        log.info('No brands for domain %r', domain)
        return False
    brand = brand_q[0]
    if brand.supported:
        log.info('Brand is supported, so it must be valid')
        # return True
    # price=-11 appears to be used as a sentinel for an invalid/unparsable price
    valid_products = brand.productmodel_set.\
        filter(price__isnull=False).\
        exclude(price=-11).\
        count()
    invalid_products = (brand.productmodel_set.filter(price__isnull=True) |
                        brand.productmodel_set.filter(price=-11)).count()
    log.info('Brand %r has %d valid and %d invalid products',
             brand, valid_products, invalid_products)
    if valid_products + invalid_products < 5:
        log.info('The number of products is too small to make an estimation')
        return False
    # 60% must be valid
    if float(valid_products) / float(valid_products + invalid_products) > 0.60:
        log.info('Large number of products have valid prices, assuming it is a brand')
        return True
    log.info('The number of valid products is too small')
    return False

def create_influencer_from_bad_brands(brand, to_save=True):
    '''
    This method creates influencers from Brands whose domains contain blogger urls.

    Example:
        blogspot = Brands.objects.filter(domain_name__icontains='blogspot.')
        blogspot.update(blacklisted=True)
        for b in blogspot:
            create_influencer_from_bad_brands(b, True)

    Double checks:
      - this function should be called only for those Brands that have not been
        passed through this function
      - we shouldn't run this for brands with domain_name in 'tumblr.com', because
        these influencers could have a separate blog (say on blogspot.com) and
        then we will have duplicates
    '''
    with platformutils.OpRecorder(operation='import_from_bad_brand', brand=brand) as opr:
        url = brand.domain_name
        domain = utils.domain_from_url(url)
        if domain in BLACKLISTED_DOMAINS:
            log.info('Domain %r is blacklisted', domain)
            return
        inf = helpers.create_influencer_and_blog_platform(
            url, 'discovered_from_brands', to_save, platform_name_fallback=True)
        if not inf:
            log.error('Blacklisted url: %r', url)
        if inf and inf.id is not None:
            opr.data = {'inf_id_created': [inf.id]}
        else:
            opr.data = {'inf_cnt_skipped': 1}

def import_from_post_content(post_id, to_save=True):
    global _DOMAINS_OF_POPULAR_BRANDS
    if _DOMAINS_OF_POPULAR_BRANDS is None:
        log.info('Starting loading _DOMAINS_OF_POPULAR_BRANDS')
        popular_brands = models.Brands.objects.\
            filter(blacklisted=False).\
            filter(num_items_shelved__gte=5).\
            exclude(name='www').\
            annotate(num_products=Count('productmodel')).\
            order_by('-num_products')[:100]
        _DOMAINS_OF_POPULAR_BRANDS = [utils.domain_from_url(b.domain_name)
                                      for b in popular_brands]
        log.info('Finished loading _DOMAINS_OF_POPULAR_BRANDS')
    post = models.Posts.objects.get(id=int(post_id))
    with platformutils.OpRecorder(operation='import_from_post_content', post=post) as opr:
        log.info('import_from_post_content for %r', post)
        _do_import_from_content(
            post.content, opr, to_save,
            blacklisted_domains=(BLACKLISTED_DOMAINS +
                                 _DOMAINS_OF_POPULAR_BRANDS +
                                 estimation.URL_FRAGMENTS_NO_RESOLVING +
                                 estimation.URL_FRAGMENTS_REQUIRING_RESOLVING +
                                 estimation.URL_FRAGMENTS_IN_IFRAMES))

def search_infs_using_preloaded_urls(queries, pages=20):
    for q in queries:
        try:
            urls = collect_urls_from_google(q, pages)
        except:
            log.exception('While collect_urls_from_google(%r), going to the next query', q)
            continue
        print "Got urls: %s" % urls
        for url in urls:
            try:
                if utils.domain_from_url(url) in import_from_blog_post.exclude_domains_set:
                    log.warn('%r is blacklisted', url)
                    continue
                dups = models.Influencer.find_duplicates(url)
                log.info('%r dups: %s', url, dups)
                if not dups:
                    log.info('YES_CREATE %r', url)
                    new_inf = helpers.create_influencer_and_blog_platform(
                        url, 'google', platform_name_fallback=True)
                    log.info('Created influencer: %r', new_inf)
                else:
                    log.info('NO_CREATE %r', url)
            except:
                log.exception('While processing url %r, skipping', url)

def filter_links_texts(links_texts):
    res = []
    for url, text in links_texts:
        domain = utils.domain_from_url(url)
        if domain in BLACKLISTED_DOMAINS:
            continue
        res.append((url, text))
    return res

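# Usage sketch for filter_links_texts (illustrative; pretend 'badbrand.com'
# is in BLACKLISTED_DOMAINS):
#
#   filter_links_texts([('http://badbrand.com/p', 'ad'),
#                       ('http://blog.me/a', 'my post')])
#   # -> [('http://blog.me/a', 'my post')]
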
def get_or_create_brand(url):
    domain = utils.domain_from_url(url)
    brand, created = debra.models.Brands.objects.get_or_create(domain_name=domain)
    if created:
        brand.name = domain
        brand.save()
        brand_helpers.create_profile_for_brand(brand)
    return brand

def meaningful_domain_fragment(url):
    url = url.lower()
    if social_platform_name_from_url(None, url) != PLATFORM_NAME_DEFAULT:
        return None
    domain = utils.domain_from_url(url)
    domain = utils.strip_last_domain_component(domain)
    parts = domain.split('.')
    parts = [p for p in parts if p not in ['blogspot', 'wordpress']]
    return ''.join(parts) or None

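# Illustrative expectations for meaningful_domain_fragment (assuming
# utils.strip_last_domain_component drops the final domain component, e.g.
# 'foo.blogspot.com' -> 'foo.blogspot'):
#
#   meaningful_domain_fragment('http://foo.blogspot.com')  # -> 'foo'
#   meaningful_domain_fragment('http://twitter.com/foo')   # -> None (social url)
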
def clean(self):
    cleaned_data = super(BloggerRegistrationForm, self).clean()
    cleaned_data["email"] = cleaned_data["email"].lower()
    email = cleaned_data.get("email")
    entered_blog_url = cleaned_data["blog_url"].lower()
    if not entered_blog_url.startswith("http://") and \
            not entered_blog_url.startswith("https://"):
        cleaned_data["blog_url"] = "http://" + cleaned_data["blog_url"]

    def is_valid_url(url):
        # Despite the name, this receives a bare domain and checks that
        # something accepts a TCP connection on port 80.
        s = socket.socket()
        try:
            s.connect((url, 80))
        except Exception:
            print "Bad url", cleaned_data["blog_url"]
            return False
        else:
            return True

    domains = [
        utils.domain_from_url(cleaned_data["blog_url"], preserve_www=False),
        utils.domain_from_url(cleaned_data["blog_url"], preserve_www=True),
    ]
    if not any(map(is_valid_url, domains)):
        raise forms.ValidationError(
            _(u'Your blog url seems to be invalid. Please double check it.'))

    # make sure another user with the given email doesn't already exist
    try:
        user = User.objects.get(username__iexact=email)
        if cleaned_data['influenity_signup']:
            pass
            # if not user.check_password(cleaned_data['password']):
            #     raise forms.ValidationError(_(u'Wrong password'))
        else:
            raise forms.ValidationError(
                _(u'Another user with the given email already exists'))
    except User.DoesNotExist:
        pass
    return cleaned_data

def brands_signup_postprocess(user_profile, form, distinct_id=None):
    #site = Site.objects.get(id=settings.SITE_ID)
    from debra.models import Brands

    domain_name = utils.domain_from_url(form.cleaned_data['brand_url'])
    print "DOMAIN_NAME: %s" % domain_name
    brands = Brands.objects.filter(domain_name=domain_name)
    if brands.exists():
        brand = brands[0]
        created = False
    else:
        brand = Brands.objects.create(domain_name=domain_name)
        created = True
    print "created: %s " % created
    print "brand: %s" % brand

    user_profile.temp_brand_domain = domain_name
    user_profile.save()
    user_profile.create_in_intercom()
    if form.data.get('from_admin') == 'true':
        user_profile.intercom_tag_add('dont-send-intro-email')
        user_profile.intercom_tag_add('customer_ignore')
    if form.referer_tag:
        user_profile.intercom_tag_add(form.referer_tag)
    # referer_page = urlparse.urlparse(form.referer).path.strip('/').split('/')[0]
    # print '* REFERER:', referer_page
    # try:
    #     tag = {
    #         '': 'home',
    #         'blogger-outreach': 'newbie',
    #         'influencer-marketing': 'expert',
    #         'agencies': 'agency',
    #         'blogger-campaign-services': 'services',
    #         'coverage': 'coverage',
    #         'the-blog': 'blog',
    #         'blogger-roundups': 'roundups',
    #     }[referer_page]
    # except KeyError:
    #     pass
    # else:
    #     user_profile.intercom_tag_add(tag)

    # if this is a new brand we know the user signing up is the brand manager.
    # Otherwise, users have to claim the brand from us.
    if created:
        brand.name = form.cleaned_data['brand_name']
        brand.save()
        brand_helpers.create_profile_for_brand(brand)
    intercom_track_event(None, 'brand-signed-up', {
        'user_email': user_profile.user.email,
        'brand_url': domain_name,
    }, user_profile.user)

def domain_to_platform(domain):
    global _DOMAIN_TO_PLATFORM_ID
    if _DOMAIN_TO_PLATFORM_ID is None:
        # Build the domain -> platform id cache once per process
        log.info('Start fetching platform data')
        _DOMAIN_TO_PLATFORM_ID = {}
        for d in models.Platform.objects.all().values('id', 'url'):
            _DOMAIN_TO_PLATFORM_ID[utils.domain_from_url(d['url'])] = d['id']
        log.info('Finished')
    if domain not in _DOMAIN_TO_PLATFORM_ID:
        return None
    return models.Platform.objects.get(id=_DOMAIN_TO_PLATFORM_ID[domain])

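# Usage sketch for domain_to_platform (illustrative): the first call pays the
# cost of loading every Platform url into the module-level cache; subsequent
# calls are a dict lookup plus one Platform.objects.get().
#
#   pl = domain_to_platform('myblog.blogspot.com')
#   if pl is not None:
#       print pl.id
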
def extract_links(self, to_save=False):
    html = self.xbrowser.driver.execute_script('return document.body.innerHTML')
    domain = utils.domain_from_url(self.xbrowser.driver.current_url)
    urls = contentfiltering.find_important_urls(html, [domain, 'www.' + domain])
    log.info('important urls (%s): %s', len(urls), urls)
    res = []
    for u in urls:
        pl = domain_to_platform(utils.domain_from_url(u))
        if pl is not None and pl.id != self.platform.id:
            log.info('detected link from <<%s>> to <<%s>> url <<%s>>',
                     self.platform, pl, u)
            lfp = models.LinkFromPlatform(source_platform=self.platform,
                                          dest_platform=pl,
                                          dest_url=u)
            if to_save:
                lfp.save()
            res.append(lfp)
    return res

def _get_brand(prod_url):
    log.debug("_get_brand for %s" % prod_url)
    domain = utils.domain_from_url(prod_url)
    brand, created = Brands.objects.get_or_create(domain_name=domain)
    if brand.name == 'Nil':
        brand.name = domain.replace('www.', '').replace('.com', '').replace('/', '')
        brand.save()
    log.debug("Created: %s Brand: %s Domain: %s" % (created, brand, domain))
    return brand

def find_common_links(xbrowser, urls):
    domains = [utils.domain_from_url(u) for u in urls]
    assert len(set(domains)) == 1, 'urls are not for the same domain: %s' % domains
    domain = domains[0]
    links_by_url = {}
    for u in urls:
        xbrowser.load_url(u)
        links_by_url[u] = xbrowser.execute_jsfun('_XPS.visibleLinksToDomains',
                                                 [domain], True)
        links_by_url[u] = [link.strip() for link in links_by_url[u]]
    common_links = set.intersection(*[set(v) for v in links_by_url.values()])
    return common_links

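# Usage sketch for find_common_links (illustrative; all urls must share one
# domain or the assert fires):
#
#   with xbrowser.XBrowser(headless_display=True) as xb:
#       common = find_common_links(xb, ['http://blog.me/post-1',
#                                       'http://blog.me/post-2'])
#       # links visible on every page, e.g. sidebar or navigation links
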
def import_network_bloggers(filename):
    with open(filename, 'rb') as f:
        lines = f.readlines()[1:]
    reader = csv.DictReader(lines, ('unusual', 'blog_name', 'url', 'persons_name',
                                    'location', 'source', 'description'))
    # Filenames are expected to look like '<something> - <blogger_type>.csv'
    blogger_type = os.path.basename(filename).split('.')[0].split(' - ')[1]
    log.info('blogger_type: %r', blogger_type)
    for row in reader:
        try:
            log.info('row: %r', row)
            if not row['url'].startswith('http'):
                log.warn('Skipping row with invalid url %r', row['url'])
                continue
            source = utils.domain_from_url(row['source'])
            if not source.strip():
                log.warn('Skipping row with no source')
                continue
            if not row['url'].strip():
                log.warn('Skipping row with no url')
                continue
            inf = helpers.create_influencer_and_blog_platform(
                row['url'], source, to_save=True, platform_name_fallback=True)
            if not inf:
                log.warn('Skipping blacklisted url')
                continue
            if not inf.is_enabled_for_automated_edits():
                log.warn('Influencer is not enabled for automated edits, skipping')
                continue
            inf.blogname = row['blog_name']
            inf.blogger_type = blogger_type
            inf.name = row['persons_name']
            inf.demographics_location = row['location']
            inf.description = row['description']
            log.info('source, blogname, name, location, description: %r, %r, %r, %r, %r',
                     inf.source, inf.blogname, inf.name,
                     inf.demographics_location, inf.description[:100])
            inf.save()

            # update blogname for blog platform
            blog_pl_q = inf.platform_set.filter(url=row['url'])
            if blog_pl_q.exists():
                blog_pl = blog_pl_q[0]
                log.info('Updating blogname of %r', blog_pl)
                blog_pl.blogname = row['blog_name']
                blog_pl.save()
        except:
            log.exception('While processing %s, skipping', row)

def do_extract_product_urls(url):
    domain = utils.domain_from_url(url)
    matching_classes = [cls for cls in CLASSES if domain in cls.supported_domains]
    res = []
    for cls in matching_classes:
        e = cls()
        e_res = e.extract_product_urls(url)
        log.info('%r extracted product urls: %r', e, e_res)
        # extract_product_urls may return None on failure (see below), so guard
        res += e_res or []
    res = utils.unique_sameorder(res)
    log.info('All product urls extracted from %r: %r', url, res)
    return res

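# Usage sketch for do_extract_product_urls (illustrative; assumes the url's
# domain appears in some extractor class's supported_domains, as with the
# rstyle.me extractor below):
#
#   do_extract_product_urls('http://example-widget-host.com/page')
#   # -> deduplicated product urls from every matching extractor
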
def extract_product_urls(self, url):
    try:
        with xbrowser.XBrowser(
                headless_display=settings.AUTOCREATE_HEADLESS_DISPLAY) as xb:
            xb.load_url(url)
            anchors = WebDriverWait(xb.driver, 10).until(
                lambda _: xb.els_by_xpath('//div[@class="hoverflow"]//a'))
            anchors = [a for a in anchors
                       if a.get_attribute('href') and
                       utils.domain_from_url(a.get_attribute('href')) == 'rstyle.me']
            urls = utils.unique_sameorder(a.get_attribute('href') for a in anchors)
            return urls
    except Exception as e:
        log.exception(e, extra={'url': url})
        return None

def _get_product_urls(post):
    product_urls_in_post = post.product_urls(exclude_domains)
    # add urls from text links for non-blog platforms
    if not post.platform.platform_name_is_blog:
        content = platformutils.iterate_resolve_shortened_urls(post.content)
        product_urls_in_post.update(
            contentfiltering.filter_urls(
                contentfiltering.find_all_urls(content), exclude_domains))
    log.debug("We have %d product urls in the post content: %s" %
              (len(product_urls_in_post), product_urls_in_post))
    post.test_and_set_sponsored_flag()

    product_urls_in_widgets = sponsorshipfetcher.get_product_urls(post.id)
    log.debug("Products in widgets: %s" % product_urls_in_widgets)
    log.debug("We have %d product urls in the widget " % len(product_urls_in_widgets))

    additional_product_url_candidates = []
    if post.pin_source:
        additional_product_url_candidates.append(post.pin_source)
    influencer_blog_platforms = post.influencer.platform_set.filter(
        platform_name__in=Platform.BLOG_PLATFORMS)
    additional_product_urls = contentfiltering.filter_urls(
        additional_product_url_candidates,
        exclude_domains + [plat.url for plat in influencer_blog_platforms])

    product_urls = product_urls_in_post.union(product_urls_in_widgets).union(
        additional_product_urls)

    # extract product urls from embedded urls
    urls_for_urls_extraction = [
        u for u in product_urls
        if utils.domain_from_url(u) in producturlsextractor.ALL_SUPPORTED_DOMAINS
    ]
    products_urls_extracted = []
    for url in urls_for_urls_extraction:
        products_urls_extracted += producturlsextractor.do_extract_product_urls(url)
    log.info('All products_urls_extracted: %r', products_urls_extracted)
    product_urls.update(products_urls_extracted)
    return product_urls

def influencers_without_blog_platform_by_domain():
    infs = Influencer.objects.filter(source='spreadsheet_import',
                                     blog_url__isnull=False)
    invalid_urls = []
    for i, inf in enumerate(infs):
        print i
        if not inf.platform_set.filter(
                platform_name__in=['Custom', 'Blogspot', 'Wordpress']).exists():
            invalid_urls.append(inf.blog_url)
    print 'Got %s invalid urls: %r' % (len(invalid_urls), invalid_urls)
    by_domain = defaultdict(list)
    for url in invalid_urls:
        by_domain[_master_domain(utils.domain_from_url(url))].append(url)
    by_domain_items = sorted(by_domain.items(),
                             key=lambda (domain, urls): len(urls),
                             reverse=True)
    pprint.pprint(by_domain_items)

def create_influencers_from_blacklisted_brands():
    blogspot_brands = models.Brands.objects.filter(domain_name__icontains='blogspot')
    print "Got %d blogspot brands" % blogspot_brands.count()
    good_urls = []
    for i, b in enumerate(blogspot_brands):
        print "%d %r" % (i, b)
        url = b.domain_name.lower()
        if utils.domain_from_url(url) in import_from_blog_post.exclude_domains_set:
            log.warn('%r is blacklisted', url)
            continue
        dups = models.Influencer.find_duplicates(url)
        log.info('%r dups: %s', url, dups)
        if not dups:
            print "Can create a new influencer for %s" % url
            good_urls.append(url)
            print "Good urls so far: %d" % len(good_urls)

def _get_own_frames(self, url, tree):
    if not url or tree is None:
        return []
    frame_like = tree.xpath('//iframe') + tree.xpath('//frame')
    srcs_to_check = []
    valid_fragments = ['blogspot', platformutils.meaningful_domain_fragment(url)]
    valid_fragments = [vf for vf in valid_fragments if vf]
    for fl in frame_like:
        src = fl.attrib.get('src')
        if not src:
            continue
        domain = utils.domain_from_url(src)
        if any(vf in domain for vf in valid_fragments):
            srcs_to_check.append(src)
    log.info('srcs_to_check: %r', srcs_to_check)
    srcs_to_check = srcs_to_check[:self.DISCOVERY_FRAME_LIMIT]
    return srcs_to_check