Example #1
    def _scrape_page(self, html):
        soup = BeautifulSoup(html, "html.parser")
        raw_content = soup.find_all('div', attrs={'class': 'g'})
        results = []
        for content in raw_content:
            link_tag = content.find('a')
            if link_tag is None or not link_tag.get('href'):
                continue
            # Drop Google's tracking parameters (everything from '&sa' onwards).
            link_url = link_tag.get('href').split('&sa')[0]
            # Strip any known unwanted prefix from the front of the URL.
            for unwanted_head in UNWANTED_URL_HEAD_LIST:
                if link_url.startswith(unwanted_head):
                    link_url = link_url[len(unwanted_head):]

            link_text = link_tag.get_text()
            short_text = content.find('span', attrs={'class': 'st'})
            if short_text is None:
                continue

            short_text = short_text.get_text().encode('ascii', errors='ignore')

            # Hash the short text + link URL; if that hash already exists in the
            #   list of global hashes, stop scraping: we've reached Google's last page.
            result_hash = get_hash8(short_text + link_url)
            if result_hash in self.global_hashes:
                self.stop_search = True
                return []

            self.global_hashes.append(result_hash)

            content = {
                'url': link_url,
                'url_text': link_text,
                'text': short_text
            }

            results.append(content)

        return results
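A note on the two module-level names the method above depends on: neither get_hash8 nor UNWANTED_URL_HEAD_LIST is shown in the snippet. A minimal sketch of what they might look like, purely as an assumption (the prefix list and the eight-character truncation are guesses, not the project's actual definitions):

    import hashlib

    # Assumed: Google result links carry redirect prefixes that the scraper
    # strips before storing the URL. The exact prefixes here are guesses.
    UNWANTED_URL_HEAD_LIST = ['/url?q=', '/url?url=']


    def get_hash8(text):
        # Assumed: a short, stable fingerprint of the text, taken here as the
        # first eight hex characters of an MD5 digest.
        if isinstance(text, bytes):
            text = text.decode('utf-8', 'ignore')
        return hashlib.md5(text.encode('utf-8')).hexdigest()[:8]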
Example #2
    def _scan_images(self, report=None):
        """
        Scan user images and analyse content
        :param report: Associated Report object
        :return: None
        """
        if not report:
            # Name the report after the current date and time.
            formatted_date = datetime.datetime.now().strftime('%A %d %b %Y, %I:%M%p')
            report = Report.objects.create(name=formatted_date, user_profile=self)
        # Face recognition is currently stubbed out; re-enable the line below
        # once a trained model is available.
        # model = facial_recognition.get_model()
        model = None
        if not model:
            print "Could not find face recognition model"
            return

        fullname = "%s %s" % (self.user.first_name, self.user.last_name)

        image_search = GoogleImageSearch(fullname, start=0, num=50, search_type="face")

        attempts = 0

        content_list = image_search.search()

        # Retry the search a few times if Google returns an empty result set.
        while not content_list and attempts <= 5:
            content_list = image_search.search()
            attempts += 1

        for content in content_list:
            print content
            img_url = content.get("img_url") or None
            if not img_url:
                continue
            # Download each candidate image to a uniquely named temp file.
            temp_file = os.path.abspath(os.path.join("media", "temp", "%s.jpg" % uuid.uuid4()))
            print temp_file
            try:
                urllib.urlretrieve(img_url, temp_file)
                # Face detection is stubbed out for now; restoring the call
                # below should yield a croppable face image.
                # img = detect_face(temp_file)
                img = None
                os.remove(temp_file)
            except Exception as e:
                # Clean up the temp file on failure and move on to the next image.
                try:
                    os.remove(temp_file)
                except Exception as e:
                    print e
                continue
            if img is None:
                continue
            # Convert to greyscale before running the recogniser.
            img = img.convert("L")
            # p = model.predict(img)
            p = None
            if p == str(self.pk):
                user = self
                type = 'photo'
                source = 'web'
                source_content = content.get('text') or None
                url = content.get('img_url', None)
                extra_data = {"page_url": content.get('page_url')}
                hashed_url = get_hash8(url)

                try:
                    UserContent.objects.get(hashed_url=hashed_url, hidden=False, user=user).soft_delete()
                except UserContent.DoesNotExist:
                    pass

                try:
                    UserContent.objects.create(
                        user=user, type=type, source=source, content=source_content, url=url, hashed_url=hashed_url,
                        extra_data=extra_data, hidden=False, report=report
                    )
                except Exception as e:
                    print e
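The hashed_url lookup and soft_delete() call above suggest that superseded rows are hidden rather than deleted, so a fresh copy of the same content can be attached to the new report. A possible sketch of that convention on the UserContent model; the fields are inferred from the create() call and the real definition is not shown, so treat the details as assumptions:

    from django.db import models


    class UserContent(models.Model):
        # Only the fields relevant to the dedup pattern are sketched here;
        # names are taken from the create() call above, types are assumptions.
        hashed_url = models.CharField(max_length=8, db_index=True)
        hidden = models.BooleanField(default=False)
        # ... user, type, source, content, url, extra_data, report ...

        def soft_delete(self):
            # Assumed behaviour: mark the superseded row hidden instead of
            # deleting it, so the new row with the same hashed_url becomes
            # the visible one for the latest report.
            self.hidden = True
            self.save(update_fields=['hidden'])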
Example #3
            try:
                twitter_content = tc.run()
                break
            except Exception as e:
                print e
            attempts += 1
        for item in twitter_content:
            content_type = 'text'
            user = self
            source = 'twitter'
            content = item['text']
            url = item['url']
            post_id = item['id']
            created_at_timestamp = item['created_at_timestamp']
            created_at_datetime = datetime.datetime.fromtimestamp(created_at_timestamp)
            hashed_url = get_hash8(url)
            sentiment_analysis = item.get('analysis', None)
            neg_sentiment_rating = None
            pos_sentiment_rating = None
            neut_sentiment_rating = None
            sentiment_label = None

            if not sentiment_analysis:
                try:
                    sentiment_analysis = analyse_text(content)
                    print sentiment_analysis
                except Exception as e:
                    print e

            if sentiment_analysis:
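Both this snippet and the image scan in the previous example retry a flaky remote call a bounded number of times before giving up. A small generic helper capturing that pattern; it is illustrative only and not part of the original code:

    def fetch_with_retries(call, max_attempts=5):
        # Invoke call() up to max_attempts times and return the first truthy
        # result; report and swallow exceptions between attempts, and fall
        # back to an empty list if every attempt fails.
        for _ in range(max_attempts):
            try:
                result = call()
            except Exception as e:
                print(e)
                continue
            if result:
                return result
        return []

    # Possible usage with the objects from the snippets above:
    # twitter_content = fetch_with_retries(tc.run)
    # content_list = fetch_with_retries(image_search.search)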