Ejemplo n.º 1
0
class GetText(object):
    ''' Get meme text
    '''
    def __init__(self, lang=None, tess_dir=None):
        ''' lang: language for tesseract
        tess_dir: tesseract training directory
        initialize an ocr object, language, tesseract directory, and extensions
        '''
        self._ocr = OCR()
        self.lang = lang
        self.tess_dir = tess_dir
        self.EXTENSIONS = ('.jpg',
                           '.png',
                           '.gif'
                           )

    def strip_text(self, text):
        ''' text: str
        replace return characters with new lines
        '''
        return text.replace('\r', '').replace('\n', '')

    def process_image(self, img):
        ''' img: url of image
        Perform ocr on image and return text
        '''
        text = self._ocr.process_image(self._ocr.get_image(img), self.lang,
                                       self.tess_dir)
        return self.strip_text(text)

    ########################################
    # imgflip
    def is_imgflip_direct_url(self, url):
        ''' url: image url
        returns true if direct imgflip image url
        returns false otherwise
        '''
        if 'i.imgflip' in url:
            return True
        return False

    def transform_imgflip_url(self, url):
        ''' url: image url
        returns imgflip page containing image
        '''
        if self.is_imgflip_direct_url(url):
            prefix = url[:url.index('i')]
            suffix = url[url.rindex('/'):url.rindex('.')]
            return prefix + 'imgflip.com/i' + suffix
        else:
            return url

    def get_imgflip(self, url):
        ''' url: image url
        returns meme type, meme text
        '''
        page = requests.get(url).text
        soup = BeautifulSoup(page, 'lxml')
        img = soup.find('img', {'id': 'im'})
        # index 0 is meme
        # index 1 is text
        meme = img.attrs['alt'].split('|')
        return (meme[0].strip(), meme[1].strip())

    ########################################
    # makeameme
    def is_makeameme_direct_url(self, url):
        ''' url: image url
        returns true if direct makeameme image url.
        returns false otherwise
        '''
        if 'media.makeameme' in url:
            return True
        return False

    def transform_makeameme_url(self, url):
        ''' url: image url
        returns makeameme page containing image
        '''
        if self.is_makeameme_direct_url(url):
            prefix = url[:url.index('media')]
            suffix = url[url.rindex('/'):url.rindex('.')]
            return prefix + 'makeameme.org/meme' + suffix
        else:
            return url

    def get_makeameme(self, url):
        ''' url: image url
        returns meme text
        '''
        page = requests.get(url).text
        soup = BeautifulSoup(page, 'lxml')
        div = soup.findAll('div', {'class': 'small-12 text-center'})
        meme = []
        for text in div[len(div) - 1].text.split('\n'):
            if len(text) > 0 and 'add your own captions' not in text:
                meme.append(text.strip())
        # index 0 is meme
        # index 1 is text
        return meme

    ########################################
    # livememe
    def is_livememe_direct_url(self, url):
        ''' url: image url
        returns true if direct livememe image url.
        returns false otherwise
        '''
        if 'lvme.me' in url or any(ext in url for ext in self.EXTENSIONS):
            return True
        return False

    def transform_livememe_url(self, url):
        ''' url: image url
        returns livememe page containing image
        '''
        if self.is_livememe_direct_url(url):
            suffix = url[url.rindex('/'):url.rindex('.')]
            return 'http://www.livememe.com' + suffix
        else:
            return url

    def get_livememe(self, url):
        ''' url: image url
        returns meme type, meme text
        '''
        page = requests.get(url).text
        soup = BeautifulSoup(page, 'lxml')
        style1 = 'word-wrap: break-word; font-weight: bold;'
        style2 = 'word-wrap: break-word; margin-top: 11px;'
        text = []
        try:
            meme_type = soup.findAll('div', {'style': style1})[0].text.strip()
        except IndexError:
            meme_type = None
        memeText = soup.findAll('div', {'style': style2})
        for i in memeText:
            text.append(i.text.upper().strip())
        # index 0 is meme
        # index 1 is text
        return (meme_type, text)

    ########################################
    # memecaptain
    def is_memecaptain_direct_url(self, url):
        ''' url: image url
        returns true if direct memecaptain image url.
        returns false otherwise
        '''
        if any(ext in url for ext in self.EXTENSIONS):
            return True
        return False

    def transform_memecaptain_url(self, url):
        ''' url: image url
        returns memecaptain page containing image
        '''
        if self.is_livememe_direct_url(url):
            suffix = url[url.rindex('/'):url.rindex('.')]
            return 'https://memecaptain.com/gend_image_pages' + suffix
        else:
            return url

    def get_memecaptain(self, url):
        ''' url: image url
        returns meme type, meme text
        '''
        page = requests.get(url).text
        soup = BeautifulSoup(page, 'lxml')
        text = []
        try:
            meme_type = soup.findAll('small')[0].text.strip()
        except IndexError:
            meme_type = None
        memeText = soup.findAll('h1')
        for i in memeText:
            text.append(i.text.upper().strip())
        return (meme_type, text)

    ########################################
    # memegen
    def is_memegen_direct_url(self, url):
        ''' url: image url
        returns true if direct memegen image url.
        returns false otherwise
        '''
        if any(ext in url for ext in self.EXTENSIONS):
            return True
        return False

    def transform_memegen_url(self, url):
        ''' url: image url
        returns memegen page containing image
        '''
        if self.is_livememe_direct_url(url):
            suffix = url[url.rindex('/'):url.rindex('.')]
            return 'http://memegen.com/meme' + suffix
        else:
            return url

    def get_memegen(self, url):
        ''' url: image url
        returns meme type, meme text
        '''
        page = requests.get(url).text
        soup = BeautifulSoup(page, 'lxml')
        img = soup.find('img', {'class': 'img-polaroid'})
        meme = img.attrs['alt']
        try:
            meme_type = meme[:meme.index(':')].strip()
        except:
            meme_type = None
        try:
            text = meme[meme.index(':') + 1:meme.rindex('-')].strip()
        except:
            text = None
        return (meme_type, text)

    ########################################
    # imgur
    def get_imgur(self, url):
        ''' url: image url
        returns direct imgur image
        '''
        page = requests.get(url).text
        soup = BeautifulSoup(page, 'lxml')
        img = soup.find('link', {'rel': 'image_src'}).attrs['href']
        return img

    ########################################
    def get_meme_text(self, imageUrl):
        '''
        imageUrl: image url
        returns meme text
        '''
        if 'imgflip' in imageUrl:
            img_flip_url = self.transform_imgflip_url(imageUrl)
            meme_list = self.get_imgflip(img_flip_url)
            meme_type = meme_list[0]
            text = meme_list[1]
            return (text, meme_type)
        elif 'makeameme' in imageUrl:
            makeameme_url = self.transform_makeameme_url(imageUrl)
            meme_list = self.get_makeameme(makeameme_url)
            meme_type = meme_list[0]
            text = meme_list[1]
            return (text, meme_type)
        elif 'livememe' in imageUrl or 'lvme.me' in imageUrl:
            livememe_url = self.transform_livememe_url(imageUrl)
            meme_list = self.get_livememe(livememe_url)
            meme_type = meme_list[0]
            meme_text = meme_list[1]
            text = ''
            for i in meme_text:
                text += i.strip() + '\n'
            return (text, meme_type)
        elif 'memecaptain' in imageUrl:
            memecaptain_url = self.transform_memecaptain_url(imageUrl)
            meme_list = self.get_memecaptain(memecaptain_url)
            meme_type = meme_list[0]
            meme_text = meme_list[1]
            text = ''
            for i in meme_text:
                text += i.strip() + '\n'
            return (text, meme_type)
        elif 'memegen' in imageUrl:
            memegen_url = self.transform_memegen_url(imageUrl)
            meme_list = self.get_memegen(memegen_url)
            meme_type = meme_list[0]
            text = meme_list[1]
            if text is None and self.is_memegen_direct_url(imageUrl):
                text = self.process_image(imageUrl)
            return (text, meme_type)
        # imgur webpage. get direct image and run ocr
        elif '//imgur' in imageUrl:
            img = self.get_imgur(imageUrl)
            return (self.process_image(img), None)
        # direct image urls. primarily imgur and i.redd.it
        elif any(ext in imageUrl for ext in self.EXTENSIONS):
            return (self.process_image(imageUrl), None)
        # website urls. need to get text if exists or img to run ocr on
        else:
            # return '*' * 10
            return (None, None)