def _scrape_page(self, html):
    """
    Parse a single Google results page and extract (url, title, snippet) triples.

    :param html: Raw HTML of a Google search results page
    :return: List of result dicts, or [] once a duplicate result is seen
    """
    soup = BeautifulSoup(html, "html.parser")
    raw_content = soup.find_all('div', attrs={'class': 'g'})
    results = []
    for content in raw_content:
        link_tag = content.find('a')
        if link_tag is None or not link_tag.get('href'):
            continue
        # Google appends tracking parameters after '&sa'; keep only the real URL.
        link_url = link_tag.get('href').split('&sa')[0]
        # Strip known redirect prefixes (e.g. '/url?q=') from the front of the URL.
        for unwanted_head in UNWANTED_URL_HEAD_LIST:
            if link_url.find(unwanted_head) > -1:
                link_url = link_url[len(unwanted_head):]
        link_text = link_tag.get_text()
        short_text = content.find('span', attrs={'class': 'st'})
        if short_text is None:
            continue
        short_text = short_text.get_text().encode('ascii', errors='ignore')
        # We hash the short text + link URL; if the hash already exists in the
        # global list we have looped back to results we have already seen, which
        # means we've reached Google's last page, so stop scraping.
        result_hash = get_hash8(short_text + link_url)
        if result_hash in self.global_hashes:
            self.stop_search = True
            return []
        self.global_hashes.append(result_hash)
        results.append({
            'url': link_url,
            'url_text': link_text,
            'text': short_text,
        })
    return results
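# A hedged sketch of what the get_hash8() helper used above might look like;
# its real implementation is not shown in this module, so the digest choice
# below is an assumption. The idea is a short, stable fingerprint of
# snippet + URL so previously seen results can be detected cheaply.
def _example_get_hash8(value):
    """Illustrative only: 8-char truncated MD5 fingerprint of a byte string."""
    import hashlib
    return hashlib.md5(value).hexdigest()[:8]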
def _scan_images(self, report=None):
    """
    Scan user images found via Google image search and analyse their content.

    :param report: Associated Report object; one is created if not supplied
    :return: None
    """
    if not report:
        # Use datetime.now() so the '%I:%M%p' part of the format actually
        # carries the current time (date.today() has no time component).
        formatted_date = datetime.datetime.now().strftime('%A %d %b %Y, %I:%M%p')
        report = Report.objects.create(name=formatted_date, user_profile=self)

    # Face recognition is currently stubbed out.
    # model = facial_recognition.get_model()
    model = None
    if not model:
        print "Could not find face recognition model"
        return

    fullname = "%s %s" % (self.user.first_name, self.user.last_name)
    image_search = GoogleImageSearch(fullname, start=0, num=50, search_type="face")

    # The image search is flaky; retry up to five times on an empty result.
    attempts = 0
    content_list = image_search.search()
    while not len(content_list) and attempts <= 5:
        content_list = image_search.search()
        attempts += 1

    for content in content_list:
        print content
        img_url = content.get("img_url")
        if not img_url:
            continue
        temp_file = os.path.abspath(os.path.join("media", "temp", "%s.jpg" % uuid.uuid4()))
        print temp_file
        try:
            urllib.urlretrieve(img_url, temp_file)
            img = detect_face(temp_file)
            # Convert to greyscale for the recognition model.
            img = img.convert("L")
            os.remove(temp_file)
        except Exception as e:
            print e
            try:
                os.remove(temp_file)
            except OSError:
                pass
            continue

        # Prediction is currently stubbed out.
        # p = model.predict(img)
        p = None
        if p == str(self.pk):
            content_type = 'photo'
            source = 'web'
            source_content = content.get('text')
            url = img_url
            extra_data = {"page_url": content.get('page_url')}
            hashed_url = get_hash8(url)
            # Soft-delete any previous visible copy of this content before
            # recreating it against the new report.
            try:
                UserContent.objects.get(hashed_url=hashed_url, hidden=False, user=self).soft_delete()
            except UserContent.DoesNotExist:
                pass
            try:
                UserContent.objects.create(
                    user=self,
                    type=content_type,
                    source=source,
                    content=source_content,
                    url=url,
                    hashed_url=hashed_url,
                    extra_data=extra_data,
                    hidden=False,
                    report=report,
                )
            except Exception as e:
                print e
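# A minimal sketch of a safer download-and-cleanup pattern for the loop above,
# assuming only the standard library and Pillow are available. tempfile avoids
# the hard-coded media/temp path, and the finally block guarantees the file is
# removed even when face detection raises.
def _example_fetch_greyscale(img_url):
    """Illustrative only: fetch an image to a temp file and return it greyscale."""
    import os
    import tempfile
    import urllib
    from PIL import Image
    fd, temp_path = tempfile.mkstemp(suffix=".jpg")
    os.close(fd)
    try:
        urllib.urlretrieve(img_url, temp_path)
        # convert() forces the pixel data to load, so the file can be
        # deleted immediately afterwards.
        return Image.open(temp_path).convert("L")
    finally:
        os.remove(temp_path)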
    # (fragment: inside a retry loop that repeatedly calls the Twitter
    # collector until it succeeds or attempts run out)
    try:
        twitter_content = tc.run()
        break
    except Exception as e:
        print e
        attempts += 1

for item in twitter_content:
    content_type = 'text'
    user = self
    source = 'twitter'
    content = item['text']
    url = item['url']
    post_id = item['id']
    created_at_timestamp = item['created_at_timestamp']
    created_at_datetime = datetime.datetime.fromtimestamp(created_at_timestamp)
    hashed_url = get_hash8(url)

    # Prefer sentiment scores already attached to the tweet; fall back to
    # running the analyser locally.
    sentiment_analysis = item.get('analysis', None)
    neg_sentiment_rating = None
    pos_sentiment_rating = None
    neut_sentiment_rating = None
    sentiment_label = None
    if not sentiment_analysis:
        try:
            sentiment_analysis = analyse_text(content)
            print sentiment_analysis
        except Exception as e:
            print e
    if sentiment_analysis:
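# The fragment above cuts off at `if sentiment_analysis:`. A hedged sketch of
# how the four sentiment fields it initialises could be populated; the return
# shape of analyse_text() is not visible in this fragment, so the keys below
# ('neg', 'pos', 'neutral', 'label') are assumptions for illustration only.
def _example_unpack_sentiment(sentiment_analysis):
    """Illustrative only: unpack an assumed analyse_text() result dict."""
    neg_sentiment_rating = sentiment_analysis.get('neg')
    pos_sentiment_rating = sentiment_analysis.get('pos')
    neut_sentiment_rating = sentiment_analysis.get('neutral')
    sentiment_label = sentiment_analysis.get('label')
    return (neg_sentiment_rating, pos_sentiment_rating,
            neut_sentiment_rating, sentiment_label)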