Example #1
    def first(self,journal_temp):
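        # Scrape the journal's table-of-contents page and return one article_info dict per article.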
        urls = []

        data = requests.get(journal_temp[Row_Name.TEMP_URL],verify=False)
        soup = BeautifulSoup(data.text, "html.parser")
        # print(soup)

        div = soup.find("div", id="primarycontent")
        h4 = div.find_all("h4")
        p = div.find_all("p")
        soup.extract()
        if len(h4) == len(p):
            for title, i in zip(h4, p):
                article_info = dict(journal_temp)
                a = i.find("a")
                # print("http://bmathaa.org/" + a["href"])
                title = title.get_text()
                t = re.match(r"\d*\.", title)
                if t is not None:
                    title = title.replace(t.group(), "").strip()
                # print(title)
                a.extract()
                au=i.get_text().replace("Author:", "").replace("Authors: ", "").replace(" and ", "##").replace(",","##")
                article_info[Row_Name.TITLE]=title
                article_info[Row_Name.AUTHOR_NAME]=au
                article_info[Row_Name.FULLTEXT_URL]="http://bmathaa.org/" + a["href"]
                urls.append(article_info)


        return urls
Example #2
def getText(link):
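    # Fetch a page and return its paragraph/div contents as a list of cleaned text blocks.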
    print(" - ", link)
    try:
        r = requests.get(link, verify=False, timeout=5)
    except KeyboardInterrupt:
        raise
    except:
        # print("Error")
        return ""

    b = BeautifulSoup(r.text, "lxml")
    [b.extract() for b in b(['script', 'link', 'meta', 'style'])]
    text = [b.extract() for b in b(['p', 'div'])]
    output = ""
    outputList = []
    for excerpt in text:
        found = excerpt.get_text(separator=" ").strip().replace(
            "\xa0", ". ").replace("\n",
                                  ". ").replace("\t",
                                                ". ").replace("\r",
                                                              ". ").strip()
        while "  " in found:  # Remove double spaces
            found = found.replace("  ", " ")
        while ". . " in found:
            found = found.replace(". . ", ". ")
        if found.count(" ") > 5:
            outputList.append(found)


#        sentences = found.split(". ")
#        for sentence in sentences:
#            stripped = sentence.strip()
#
#            while "  " in stripped: # Remove double spaces
#                stripped = stripped.replace("  ", " ")
#            while ". . " in stripped:
#                stripped = stripped.replace(". . ", ". ")
#
#            if stripped.count(" ") < 5:
#                continue
#
#            output += stripped + ". "
#            outputList.append(stripped)

#    while "  " in output: # Remove double spaces
#        output = output.replace("  ", " ")
#    while ". . " in output:
#        output = output.replace(". . ", ". ")
#    return outputList
    return outputList
Example #3
    def get_lyric(self, singer, song):
        # Replace spaces with _
        singer = singer.replace(' ', '_')
        song = song.replace(' ', '_')
        url = 'http://lyrics.wikia.com/{0}:{1}'.format(singer, song)
        req = requests.get(url)
        s = BeautifulSoup(req.text, "lxml")
        # Get main lyrics holder
        lyrics = s.find("div", {'class': 'lyricbox'})
        if lyrics is None:
            return None
        # Remove Scripts
        [s.extract() for s in lyrics('script')]
        # Remove comments
        comments = lyrics.findAll(text=lambda text: isinstance(text, Comment))
        [comment.extract() for comment in comments]
        # Remove unnecessary tags
        for tag in ['div', 'i', 'b', 'a']:
            for match in lyrics.findAll(tag):
                match.replaceWithChildren()

        # TODO: check if you need the encode/decode thing, if you do then do a try catch for it

        # get output as string and remove non unicode characters and replace <br> with newlines
        #output = str(lyrics).encode('utf-8', errors = 'replace')[22:-6:].decode('utf-8').replace('\n','').replace('<br/>','\n')
        output = str(lyrics).replace('\n', '').replace('<br/>', '\n')[22:-6:]
        try:
            return output
        except:
            return output.encode('utf-8')
Example #4
	def getLyrics(singer, song):
		#Replace spaces with _
		singer = singer.replace(' ', '_')
		song = song.replace(' ', '_')
		r = requests.get('http://lyrics.wikia.com/{0}:{1}'.format(singer,song))
		s = BeautifulSoup(r.text)
		#Get main lyrics holder
		lyrics = s.find("div",{'class':'lyricbox'})
		if lyrics is None:
			raise ValueError("Song or Singer does not exist or the API does not have Lyrics")
		#Remove Scripts
		[s.extract() for s in lyrics('script')]

		#Remove Comments
		comments = lyrics.findAll(text=lambda text:isinstance(text, Comment))
		[comment.extract() for comment in comments]

		#Remove unnecessary tags
		for tag in ['div','i','b','a']:
			for match in lyrics.findAll(tag):
				match.replaceWithChildren()
		#Get output as a string and remove non unicode characters and replace <br> with newlines
		output = str(lyrics).encode('utf-8', errors='replace')[22:-6:].decode("utf-8").replace('\n','').replace('<br/>','\n')
		try:
			return output
		except:
			return output.encode('utf-8')
Example #5
def lyricswikia(artist, song):
    # original code found @
    # https://github.com/geekpradd/PyLyrics/blob/master/PyLyrics/functions.py
    song = song.split(' - ', 1)[0]
    artist = artist.replace(' ', '_')
    song = song.replace(' ', '_')
    url = 'http://lyrics.wikia.com/{0}:{1}'.format(artist, song)
    print('Trying:', url)
    r = requests.get(url)
    s = BeautifulSoup(r.text, 'html.parser')
    # Get main lyrics holder
    lyrics = s.find("div", {'class': 'lyricbox'})
    if lyrics is not None:
        # Remove Scripts
        [e.extract() for e in lyrics('script')]

        # Remove Comments
        comments = lyrics.findAll(text=lambda text: isinstance(text, Comment))
        [comment.extract() for comment in comments]

        # Remove unnecessary tags
        for tag in ['div', 'i', 'b', 'a']:
            for match in lyrics.findAll(tag):
                match.replaceWithChildren()
        # Get output as a string and remove non unicode characters and replace
        # <br> with newlines
        lyrics = str(lyrics).encode(
            'utf-8', errors='replace')[22:-6:].decode("utf-8").replace(
                '\n', '').replace('<br/>', '\n')
    try:
        return lyrics
    except:
        return lyrics.encode('utf-8')
Example #6
    def getLyrics(singer, song):
        #Replace spaces with _
        singer = singer.replace(' ', '_')
        song = song.replace(' ', '_')
        r = requests.get('http://lyrics.wikia.com/{0}:{1}'.format(
            singer, song))
        s = BeautifulSoup(r.text, 'html.parser')
        #Get main lyrics holder
        lyrics = s.find("div", {'class': 'lyricbox'})
        if lyrics is None:
            raise ValueError(
                "Song or Singer does not exist or the API does not have Lyrics"
            )
        #Remove Scripts
        [s.extract() for s in lyrics('script')]

        #Remove Comments
        comments = lyrics.findAll(text=lambda text: isinstance(text, Comment))
        [comment.extract() for comment in comments]

        #Remove unnecessary tags
        for tag in ['div', 'i', 'b', 'a']:
            for match in lyrics.findAll(tag):
                match.replaceWithChildren()
        #Get output as a string and remove non unicode characters and replace <br> with newlines
        output = str(lyrics).encode(
            'utf-8', errors='replace')[22:-6:].decode("utf-8").replace(
                '\n', '').replace('<br/>', '\n')
        try:
            return output
        except:
            return output.encode('utf-8')
Example #7
    def test_extract_with_arg(self):
        ''' Test extracting by calling extract() with a random argument. '''

        markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
        extr_arg_soup = BeautifulSoup(markup, "html.parser")
        with self.assertRaises(TypeError):
            extr_arg_soup_extracted = extr_arg_soup.extract("arg")
Example #8
def getSalesRank(salesRankFrom, asin, salesRankTo):
    """ method to check if the sales rank of the product lies in the range provided by the user
    salesRankFrom : lower limit for sales rank entered by the user
    asin : asin of the product
    salesRankTo : upper limit for sales rank entered by the user
    """
    if salesRankFrom and salesRankTo:
        url = 'http://www.amazon.co.uk/dp/' + asin
        mech = Browser()
        session = requests.Session()
        proxy = check_proxy(session)

        if proxy != 'NA':
            mech.set_proxies({'http': proxy})

        page = mech.open(url)
        html = page.read()

        # try to get the sales rank else return True. Does not work sometimes. Need to be more robust.
        try:
            soup = BeautifulSoup(str(html))
            data = BeautifulSoup.extract((soup))
            salesRankElem = data.find(attrs={
                'id': 'SalesRank'
            }).find(attrs={
                'class': 'value'
            }).getText()
            salesRank = re.findall(r'\n(.*?)\sin', salesRankElem)[0].replace(
                'in', '').replace(',', '').strip()
            return salesRankFrom <= int(salesRank) <= salesRankTo
        except Exception, e:
            return True
Example #9
    def buildInteractions(self):
        page = requests.get(self.__link)
        # Levels 1-3 share the same scraping logic; only the li class differs.
        with open(self.__name + ".txt", "a") as f:
            for level in (1, 2, 3):
                soup = BeautifulSoup(page.content, 'html.parser')
                links = soup.select("ul.interactions.ddc-list-unstyled li.int_%d a" % level)
                soup = BeautifulSoup(str(links)[1:-1], 'html.parser')
                soup = BeautifulSoup(str(soup.extract()), 'html.parser')
                if str(soup).strip() == "":
                    f.write("No level %d interactions\n" % level)
                else:
                    f.write("Level %d interactions:\n" % level)
                    for entry in soup:
                        curr = re.sub('<a href=[^>]+>', '', str(entry))
                        if curr[0] != ',':
                            f.write(curr.replace('</a>', "") + "\n")
Example #10
    def test_extract_raises(self):
        ''' Test extracting a non-existing tag, ensuring that the original isn't changed. '''

        markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
        extr_soup = BeautifulSoup(markup, "html.parser")
        extr_null_soup = extr_soup.extract()

        self.assertEqual(extr_soup, extr_null_soup)
Example #11
def getSalesRank(salesRankFrom, asin, salesRankTo):
    """ method to check if the sales rank of the product lies in the range provided by the user
    salesRankFrom : lower limit for sales rank entered by the user
    asin : asin of the product
    salesRankTo : upper limit for sales rank entered by the user
    """
    salesRank = 'NA'

    url = 'http://www.amazon.co.uk/dp/' + unidecode(asin)
    mech = Browser()
    session = requests.Session()
    proxy = check_proxy(session)

    if proxy != 'NA':
        mech.set_proxies({'http': proxy})

    page = mech.open(url)
    if page.code == 503:
        print page.geturl()
    html = page.read()

    # try to get the sales rank else return True. Does not work sometimes. Need to be more robust.
    try:
        soup = BeautifulSoup(str(html))
        data = BeautifulSoup.extract((soup))
        salesRankElem = data.find(attrs={
            'id': 'SalesRank'
        }).find(attrs={
            'class': 'value'
        }).getText()
        salesRank = re.findall(r'\n(.*?)\sin', salesRankElem)[0].replace(
            'in', '').replace(',', '').strip()
        if salesRankFrom and salesRankTo:
            return (salesRankFrom <= int(salesRank) <= salesRankTo), salesRank
        else:
            return True, salesRank
    except Exception, e:
        try:
            salesRankElem = data.find(attrs={
                'class': 'zg_hrsr_item'
            }).getText()
            salesRank = salesRankElem.split()[0].replace('#', '').strip()
            if salesRankFrom and salesRankTo:
                return (salesRankFrom <= int(salesRank) <=
                        salesRankTo), salesRank
            else:
                return True, salesRank
        except:
            try:
                product = amazon_uk.lookup(ItemId=unidecode(asin))
                salesRank = product.sales_rank
                if salesRank is not None:
                    return (salesRankFrom <= int(salesRank) <=
                            salesRankTo), salesRank
                else:
                    return True, 'NA'
            except:
                return True, salesRank
Example #12
def get_text(filepath,file_name):
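    # Parse a saved HTML file, drop its script tags, then walk the tree breadth-first and write qualifying text nodes to a mirror file under output1.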
    output_dir=os.path.join('/Users/hzxue/output1',filepath.split('/',3)[3])
    if os.path.exists(output_dir) is not True:
        os.makedirs(output_dir)
    read_file_path=os.path.join(filepath,file_name)
    f=open(os.path.join(output_dir,file_name),'w')
    soup=BeautifulSoup(open(read_file_path),'lxml')
    print soup.title.string
    for s in soup("script"):
        s.extract()
    queue = [([], soup)]  # queue of (path, element) pairs
    while queue:
        path, element = queue.pop(0)
        if hasattr(element, 'children'):  # check for leaf elements
            for child in element.children:
                queue.append((path + [child.name if child.name is not None else type(child)],child))
        if element.string and is_text(element)==1: 
            if is_used(element.string)==1:
                f.write(element.string.strip()+'\n')
Example #13
def getSalesRank(salesRankFrom, asin, salesRankTo):
    url = 'http://www.amazon.co.uk/dp/' + asin
    mech = Browser()
    page = mech.open(url)
    html = page.read()
    soup = BeautifulSoup(html)
    data = BeautifulSoup.extract(soup)
    salesRankElem = data.find(attrs = {'id':'SalesRank'}).find(attrs = {'class' : 'value'}).getText()
    salesRank =  re.findall(r'\d+\sin', salesRankElem)[0].replace('in','').strip()
    return salesRankFrom <= int(salesRank) <= salesRankTo
Example #14
 def get_text(cls, text):
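      # Strip carriage returns and tabs, remove script/head/footer/style blocks, and return the remaining visible text.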
     text = Function.remove_carriage_return(text)
     text = Function.replace_tab_to_space(text)
     text = Function.remove_tags(text, 'script')
     text = Function.remove_tags(text, 'head')
     text = Function.remove_tags(text, 'footer')
     text = Function.remove_tags(text, 'style')
     soup = BeautifulSoup(text, 'html.parser')
     text = soup.extract().get_text()
     text = Function.replace_duplicate_to_space(text)
     return text
Example #15
    def parse(self, item, *args, **kwargs):
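        # Run the text miner over the item body and yield one CommonItem per extracted word and its word_point.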
        self.exporter.fields_to_export = [
            'uuid', 'domain', 'url', 'word', 'word_point', 'date', 'section',
            'pdate'
        ]
        try:
            tm = textmine.textmine()
            soup = BeautifulSoup(item['body'], 'html.parser')
            text = soup.extract().get_text()

            if self.conf.has_option(self.section, 'exclude_keywords'):
                exclude_keywords = ast.literal_eval(
                    self.conf.get(self.section, 'exclude_keywords'))
                for ex_word in exclude_keywords:
                    text = text.replace(ex_word, '')

            tm_result = tm.get(text)

            if len(tm_result) > 0 and len(tm_result[1]) > 0:
                for word in tm_result[1]:
                    # Create the item once and register every field it will carry
                    new_item = CommonItem()
                    new_item.fields["encoding"] = CommonField()
                    new_item.fields["uuid"] = CommonField()
                    new_item.fields["domain"] = CommonField()
                    new_item.fields["url"] = CommonField()
                    new_item.fields["word"] = CommonField()
                    new_item.fields["word_point"] = CommonField()
                    new_item.fields["date"] = CommonField()
                    new_item.fields["section"] = CommonField()
                    new_item.fields["pdate"] = CommonField()
                    new_item["encoding"] = item["encoding"]
                    new_item["uuid"] = item["uuid"]
                    new_item["domain"] = item["domain"]
                    new_item["url"] = item["url"]
                    new_item["word"] = word[0]
                    new_item["word_point"] = str(word[1])
                    new_item["date"] = item["date"]
                    new_item["section"] = item["section"]
                    new_item["pdate"] = datetime.datetime.now().strftime(
                        '%Y%m%d%H%M00')

                    if self.start_pdate is None:
                        self.start_pdate = new_item["pdate"]

                    yield new_item
        except Exception as ex:
            print(ex)
Example #16
    def parse(self, item, *args, **kwargs):
        try:
            tm = textmine.textmine()
            soup = BeautifulSoup(item['body'], 'html.parser')
            text = soup.extract().get_text()

            if self.conf.has_option(self.section, 'exclude_keywords'):
                exclude_keywords = ast.literal_eval(
                    self.conf.get(self.section, 'exclude_keywords'))
                for ex_word in exclude_keywords:
                    text = text.replace(ex_word, '')

            tm_result = tm.get(text)
            # item.fields["pdate"] = CommonField()
            # item["pdate"] = datetime.datetime.now().strftime('%Y%m%d%H%M00')

            # if self.start_pdate is None:
            # 	self.start_pdate = item["pdate"]

            # item['body'] = str(item['body']).replace('\n', ' ').strip()

            # item.fields["text"] = CommonField()
            # item["text"] = text.replace('\n', ' ').strip()

            item.fields["top_sentence"] = CommonField()
            item.fields["top_word"] = CommonField()
            item.fields["sentences"] = CommonField()
            item.fields["words"] = CommonField()

            if len(tm_result) > 0 and len(tm_result[0]) > 0:
                item["top_sentence"] = str(tm_result[0][0][2]).replace(
                    '\n', ' ').strip()

            if len(tm_result) > 0 and len(tm_result[1]) > 0:
                item["top_word"] = str(tm_result[1][0][0]).replace(
                    '\n', ' ').strip()

            if len(tm_result) > 0:
                item["sentences"] = str(tm_result[0]).replace('\n',
                                                              ' ').strip()
            if len(tm_result) > 1:
                item["words"] = str(tm_result[1]).replace('\n', ' ').strip()

            # self.exporter.fields_to_export = ['uuid', 'domain', 'url', 'keyword', 'top_sentence', 'top_word', 'sentences', 'words', 'text', 'body', 'date', 'section', 'pdate']
            self.exporter.fields_to_export = [
                'uuid', 'domain', 'url', 'keyword', 'top_sentence', 'top_word',
                'sentences', 'words', 'date', 'section'
            ]

            yield item

        except Exception as ex:
            print(ex)
Example #17
def clean_data(text, origin_url=None):
    # Pre-clean the raw text
    text = pre_clean(text)

    # Clean up the tags
    soup = BeautifulSoup(text, 'lxml')
    soup = clean_attrs(soup)
    soup = clean_tags(soup)
    soup = clean_extra(soup)
    # Resolve relative URLs to absolute ones
    if origin_url:
        soup = join_url(soup, origin_url)

    # Convert back to text for further processing
    text = str(soup.extract())
    text = clean_text(text)
    return text
Example #18
def webcapture(url):
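    # Fetch the URL with a browser User-Agent and build the return string by running the external extract() helper over each <p> tag.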
    headers = {
        # pretend I am a browser
        'User-Agent': 'Mozilla/5.0',
    }

    session = requests.Session()  #setup session
    data = session.get(url, headers=headers)  #scrape the data
    soup = BeautifulSoup(data.text, 'html.parser')  #parse the data
    ext = soup.extract()  #return the parsed data
    ii = _remove_attrs(soup)
    #i2 = soup.clear()
    p = soup.find_all('p')
    gettext = soup.get_text()
    tt = ''
    for d in p:
        tt += extract(str(d))
    return tt  #,s.get_text()
Example #19
    def getLyrics(singer, song):
        # Replace spaces with _
        singer = singer.replace(' ', '_')
        song = song.replace(' ', '_')
        r = requests.get('http://lyrics.wikia.com/{0}:{1}'.format(
            singer, song))
        s = BeautifulSoup(r.text, features="html.parser")
        if s.find('i') is None:
            # raise ValueError("API could not find the song.")
            return None

        album = str(s.find('i').find('a').text)
        lyrics = s.find("div", {'class': 'lyricbox'})

        if lyrics is None:
            raise ValueError("API could not find the lyrics to the song.")

        # Remove Scripts
        [s.extract() for s in lyrics('script')]

        # Remove Comments
        comments = lyrics.findAll(text=lambda text: isinstance(text, Comment))
        [comment.extract() for comment in comments]

        # Remove unnecessary tags
        for tag in ['div', 'i', 'b', 'a']:
            for match in lyrics.findAll(tag):
                match.replaceWithChildren()

        # Get output as a string, remove non-unicode characters, and replace
        # <br> with newlines

        # Python 3
        if sys.version_info.major > 2:
            output = str(lyrics).encode('utf-8', errors='replace')[22:-6:].\
              decode("utf-8").replace('\n','').replace('<br/>','\n')
        else:  # Python 2
            output = str(lyrics)[22:-6:].decode("utf-8").replace('\n','').\
              replace('<br/>','\n')
        try:
            return [album, output]
        except:
            return [album, output.encode('utf-8')]
Example #20
 def hooker(self):
     """
     Query DATABASES_URL for self.query, parse the result page, and download any database links found.
     """
     try:
         req = requests.get(DATABASES_URL.format(self.query.lower()),
                            headers=self.headers,
                            proxies=self.proxies)
         soup = BeautifulSoup(req.content, "html.parser")
         self.content = soup.extract()
         results = self._parse_html()
         if results:
             self._find_database_links()
             if len(self.database_links) != 0:
                 return self._download_database()
         else:
             return []
     except Exception:
         return []
Example #21
 def downloadAndChangeImgPath(self, html_have_img, newsDate) -> str:
     '''
     :param html_have_img:  the HTML of the article body containing <img> tags
     :param newsDate:   the article date (used as the folder name for downloaded images)
     :return:  the HTML with each img src rewritten to the locally downloaded path
     '''
     print("Downloading images from the article body")
     soup = BeautifulSoup(html_have_img, 'lxml')
     for img in soup.find_all("img"):
         tempSrc = img['src']
         if tempSrc.find("http:") == -1:  # the scheme part may be missing by default
             tempSrc = "http:" + tempSrc
             # time.sleep(1)
         fixedSrc = self.downloadTool.downloadImg(
             img_url=tempSrc, imgName=None, referer=None,
             now_date=newsDate)  # the folder created below; by default it lags one day behind
         img['src'] = fixedSrc  # put the local path back into the tag
         # download, get the local path back, then update the tag
         print(img['src'])
     print("Finished downloading the images and rewriting their src.")
     return [str(soup.extract()).replace("'", '"')]
Example #22
def getSKUData(prod, queue):
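    # Open the Amazon product page for the ASIN and read its sales rank, falling back to the zg_hrsr_item element and then an Amazon Product Advertising API lookup.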
    global data
    data = []
    asin = (prod['asin'])
    image = prod['image']
    title = prod['title']
    price = prod['price_new']
    url = 'http://www.amazon.com/dp/' + unidecode(asin)
    mech = Browser()
    session = requests.Session()
    page = mech.open(url)

    if page.code == 503:
        print '503', page.geturl()
    html = page.read()
    soup = BeautifulSoup(str(html))
    data = BeautifulSoup.extract((soup))

    # try to get the sales rank else return True. Does not work sometimes. Need to be more robust.
    try:
        salesRankElem = data.find(attrs={
            'id': 'SalesRank'
        }).find(attrs={
            'class': 'value'
        }).getText()
        salesRank = re.findall(r'\n(.*?)\sin', salesRankElem)[0].replace(
            'in', '').replace(',', '').strip()
    except Exception, e:
        try:
            salesRankElem = data.find(attrs={
                'class': 'zg_hrsr_item'
            }).getText()
            salesRank = salesRankElem.split()[0].replace('#', '').strip()
        except:
            try:
                product = amazon.lookup(ItemId=unidecode(asin))
                salesRank = product.sales_rank
            except:
                salesRank = 'NA'
Example #23
    def getAddress(self, response):
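        # Pull the store address from the response; write USA addresses (plus the app name) to addresses.txt/appnames.txt, everything else to other.txt.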
        sel = scrapy.Selector(response)

        results = sel.xpath("//*[contains(@class, 'media-text')]")[0]
        address = BeautifulSoup(results.extract(), 'html.parser').get_text()
        address = address.replace("A wine and liquor store located in ", "")
        address = address.replace("A store located at ", "")
        if address.find("USA") >= 0:
            addresses = open('addresses.txt', 'a')
            addresses.write("%s\n" % address)

            appname = sel.xpath(
                "//*[contains(@class, 'media-heading text-center')]")[0]
            appname = BeautifulSoup(appname.extract(),
                                    'html.parser').get_text()
            appname = appname.replace('\' mobile app', "")
            appname = appname.replace('\'s mobile app', "")

            appnames = open('appnames.txt', 'a')
            appnames.write("%s\n" % appname)

        else:
            other = open('other.txt', 'a')
            other.write("%s\n" % address)
Example #24
linkSite = omgSite.find_all('a')

# <codecell>

saveLinkz = open('htmldoc', 'w')
saveLinkz.write(siteData)
saveLinkz.close()

# <codecell>

openLinkz = open('htmldoc', 'r')
openLinkz.read()

# <codecell>

print omgSite.extract()

# <codecell>

print omgSite.setup

# <codecell>

print omgSite.title

# <codecell>

print omgSite.wrap

# <codecell>
Example #25
    def get_item(self, response):
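        # Parse an Amazon product page into a dict of title, details, images, brand, price, description, features and videos.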
        title = response.css('#productTitle::text').extract_first()
        title = title.strip() if title is not None else None
        # print(title)

        details_output = dict()
        ASIN = None

        details = response.css('#detail-bullets .content > ul > li')

        for detail in details:
            detail_name = detail.css('b::text').extract_first()
            detail_name = detail_name.replace(':', '').strip()

            detail = BeautifulSoup(detail.extract(), 'lxml')

            # Remove detail name's tag in each detail
            for span in detail.find_all('b'):
                span.extract()
            detail = Selector(text=str(detail))

            detail_values = detail.css('li ::text').extract()
            detail_values = utils.normalize_str_array(detail_values)

            detail_value = detail_values[0] if len(detail_values) > 0 else None

            # Parse ranks number
            if 'Amazon Best Sellers Rank' in detail_name:
                detail_value = detail_value.strip().split(' ')[0]
                detail_value = utils.parse_int(detail_value)

            if 'ASIN' in detail_name:
                ASIN = detail_value

            details_output[detail_name] = detail_value

        alt_images = response.css('#altImages img::attr(src)').extract()
        # print(alt_images)

        brand = response.css('#bylineInfo::text').extract_first()
        # print(brand)

        brand_url = response.css('#bylineInfo::attr(href)').extract_first()
        brand_url = response.urljoin(
            brand_url) if brand_url is not None else None
        # print(brand_url)

        price = response.css(
            '.snsPriceBlock .a-color-price::text').extract_first()
        if price is None:
            price = response.css('#priceblock_ourprice::text').extract_first()

        price = price.strip() if price is not None else None
        # print(price)

        description = response.css(
            '#productDescription p::text, #productDescription h3::text'
        ).extract()
        description = utils.normalize_str_array(description)
        # description = '\n'.join(description)
        # print(description)

        plus_desc = response.css('#aplus')
        plus_desc_html = plus_desc.css('.aplus-v2').extract_first()

        plus_desc_texts = plus_desc.css(
            '*:not(script):not(style)::text').extract()
        plus_desc_texts = utils.normalize_str_array(plus_desc_texts)
        plus_desc_text = '\n'.join(plus_desc_texts)

        features = response.css('#feature-bullets ul li ::text').extract()
        features = [feature.strip() for feature in features]
        # print(features)

        videos = response.css(
            '#vse-rel-videos-carousel .vse-video-item::attr(data-video-url)'
        ).extract()
        # print(videos)

        return {
            'ASIN': ASIN,
            'url': response.url,
            'title': title,
            'brand': {
                'name': brand,
                'url': brand_url
            },
            'alt_images': alt_images,
            'details': details_output,
            'price': price,
            'description': description,
            'plus_description': {
                'text': plus_desc_text,
                'html': plus_desc_html
            },
            'features': features,
            'videos': videos,
        }
Example #26
class SimpleRender(object):
    def __init__(self, style="default"):
        self.__md = ""
        self.__soup = None

    def __soup_append_style(self, css_select, style):
        for cs in css_select.split(","):
            for i in self.__soup.select(cs.strip()):
                i["style"] = i.get("style", "") + style

    def __soup_render_table_row(self):
        for table in self.__soup.select(".markdown-body table"):
            i = 0
            for row in table.select("tbody tr"):
                if i % 2 == 0:
                    row["style"] = row.get("style", "") + \
                        "background-color:#f8f8f8;"
                i += 1

    def render(self):
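        # Convert the accumulated Markdown to HTML, inline the GitHub-style CSS as style attributes, and return the styled soup.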
        html = markdown.markdown(
            self.__md,
            extensions=[exttables.TableExtension()])
        html = "<div class=\"markdown-body\">%s</div>" % html
        self.__soup = BeautifulSoup(html)
        self.__soup_append_style(
            ".markdown-body",
            ('font-family:"Helvetica Neue",Helvetica,"Segoe UI",Arial,'
             'freesans,sans-serif,"Apple Color Emoji","Segoe UI Emoji",'
             '"Segoe UI Symbol";'
             'font-size:16px;'
             'line-height:1.6;'
             'word-wrap:break-word;'
             'width:1280px;'
             'margin:auto;'
             'color:#333;')
        )
        self.__soup_append_style(
            (".markdown-body p,"
             ".markdown-body blockquote,"
             ".markdown-body ul,"
             ".markdown-body ol,"
             ".markdown-body dl,"
             ".markdown-body table,"
             ".markdown-body pre"),
            'margin-top:0;margin-bottom:16px;'
        )
        self.__soup_append_style(
            "h1, h2, h3, h4",
            ("margin-top:1em;"
             "margin-bottom:16px;"
             "font-weight:bold;"
             "line-height:1.4;")
        )
        self.__soup_append_style(
            ".markdown-body h1",
            ("padding-bottom:0.3em;"
             "font-size:2.25em;"
             "line-height:1.2;"
             "border-bottom:1px solid #eee;")
        )
        self.__soup_append_style(
            ".markdown-body h2",
            ("padding-bottom:0.3em;"
             "font-size:1.75em;"
             "line-height:1.225;"
             "border-bottom:1px solid #eee;")
        )
        self.__soup_append_style(
            ".markdown-body h3",
            ("font-size:1.5em;"
             "line-height:1.43;")
        )
        self.__soup_append_style(".markdown-body h4", "font-size:1.25em;")
        self.__soup_append_style(
            ".markdown-body table",
            ("display:block;"
             "width:100%;"
             "overflow:auto;"
             "word-break:normal;"
             "word-break:keep-all;"
             "border-spacing:0;"
             "border-collapse:collapse;")
        )
        self.__soup_append_style(
            ".markdown-body table th, .markdown-body table td",
            "padding:6px 13px;border:1px solid #ddd;"
        )
        self.__soup_append_style(
            ".markdown-body table th",
            "font-weight:bold;"
        )
        self.__soup_append_style(
            ".markdown-body ul",
            "padding-left:2em;"
        )
        self.__soup_append_style(
            ".markdown-body code",
            ('font-family:Consolas,"Liberation Mono",Menlo,Courier,monospace;'
             "padding:0.2em 0.4em;"
             "margin:0;"
             "font-size:85%;"
             "background-color:rgba(0,0,0,0.04);"
             "border-radius:3px;")
        )
        self.__soup_append_style(
            ".markdown-body span.red",
            ("color:#f33;"
             "padding:0.2em 0.4em;"
             "margin:0;"
             "background-color:rgba(200,32,32,0.15);"
             "border-radius:3px;")
        )
        self.__soup_render_table_row()
        return self.__soup.extract()

    def add_header1(self, header=""):
        self.__md += ("# %s\n" % header)

    def add_header2(self, header=""):
        self.__md += ("## %s\n" % header)

    def add_header3(self, header=""):
        self.__md += ("### %s\n" % header)

    def add_header4(self, header=""):
        self.__md += ("#### %s\n" % header)

    def add_text(self, text=""):
        self.__md += ("\n%s\n\n" % text)

    def add_table(self, theader=[], trows=[{}], align=None):
        table = "|%s|\n" % "|".join(theader)
        style = {"c": ":---:", "l": ":---", "r": "---:"}
        if align is None:
            table += "|%s|\n" % "|".join(["---"] * len(theader))
        else:
            table += "|%s|\n" % "|".join([style.get(a, "---")
                                          for a in align])
        for row in trows:
            l = []
            for h in theader:
                l.append(str(row.get(h, " ")))
            table += "|%s|\n" % "|".join(l)
        self.__md += (table + "\n")

    def add_list(self, text_list=[]):
        def f(ll, wcnt):
            text = ""
            for l in ll:
                if isinstance(l, tuple):
                    text += ("%s* %s\n" % (" " * wcnt, l[0]))
                    text += f(l[1], wcnt + 4)
                else:
                    text += ("%s* %s\n" % (" " * wcnt, l))
            return text
        self.__md += f(text_list, 0)

    def add_md_text(self, text):
        self.__md += text
Example #27
                salesRank = product.sales_rank
            except:
                salesRank = 'NA'
    Main_Image = data.find(attrs={'id': 'imgTagWrapperId'}).find('img')['src']
    Num_Images = len(
        data.find(attrs={
            'id': 'altImages'
        }).findAll(attrs={'class': 'a-spacing-small item'}))
    Title = title
    Price = price
    Average_Customer_Review = data.find(
        attrs={'class': 'reviewCountTextLinkedHistogram noUnderline'})['title']
    page1 = mech.open('http://www.amazon.com/product-reviews/' + asin)
    html1 = page1.read()
    soup1 = BeautifulSoup(str(html1))
    data1 = BeautifulSoup.extract((soup1))
    Ratings = data1.findAll(attrs={'class': 'a-histogram-row'})
    Stars_and_numbers = {}
    for rating in Ratings:
        key = rating.findAll(
            attrs={'class': 'a-text-right aok-nowrap'})[0].text
        value = rating.findAll(attrs={'class': 'a-text-right aok-nowrap'})[1].text
        Stars_and_numbers[key] = value

    #Stars_and_numbers = data.find(attrs = {'class':'a-icon-alt'}).getText()
    Product_Link = url
    product_0 = amazon.lookup(ItemId=asin)
    Description = product_0.editorial_review
    features = data.find(attrs={'id': 'feature-bullets'}).findAll('li')
    Feature1 = features[1].getText()
    Feature2 = features[2].getText()
Example #28
#coding: UTF-8
from readability.readability import Document
from bs4 import BeautifulSoup
import html2text
import urllib
import codecs

f = open('input.html')
htmldata = f.read()

readable_article = Document(htmldata).summary()
readable_title = Document(htmldata).short_title()

soup = BeautifulSoup(readable_article, "lxml")
soup.get_text()
soup.find("img")
soup.find("a")
soup.extract()
readable_article = soup.get_text()

f = codecs.open('output.html', 'w', 'utf-8')
f.write(readable_title)
f.write(readable_article)
f.close()

readable_article = html2text.html2text(readable_article, )

f = codecs.open('output.txt', 'w', 'utf-8')
f.write(readable_title)
f.write(readable_article)
f.close()
Example #29
# using this header to prevent mod security error
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:55.0)'
}

url = "http://www.clivebanks.co.uk/X-Files%20Timeline.htm"
site_url = "/".join(url.split("/")[:3])

response = requests.get(url, headers=headers)

# Fetch the front page and write it to disk instead of keeping it in memory.
with open("front_page.html", mode="wb") as file:
    file.write(response.content)

# Collect the links from the front page
with open("front_page.html", mode="rb") as file:
    soup = BeautifulSoup(file, features="html.parser")

# Looping through links of season one.
urls = []
for a in soup.find_all("a", href=True):
    urls.append(site_url + "/" + a["href"])
    if a["href"] == "X-Files/Truth.htm":
        break
# Saving out the text file to disk as a dataset.
with open("x_files_dataset_new.txt", mode="w") as file:
    for url_ in urls[1:]:
        response = requests.get(url_, headers=headers)
        soup = BeautifulSoup(response.content, features="html.parser")
        file.write(soup.extract().text)
Example #30
import re
import mechanize
from mechanize import Browser
from bs4 import BeautifulSoup

url = 'http://www.amazon.co.uk/dp/B00NVDNDUW'

mech = Browser()
page = mech.open(url)
html = page.read()
soup = BeautifulSoup(html)
#data = BeautifulSoup.extract(soup)

def extract(soup):
    table = soup.find("table",attrs={'id':'ctl00_TemplateBody_WebPartManager1_gwpste_container_SearchForm_ciSearchForm_RTable'})
    #print table
    data = []
    for row in table.findAll("tr"):
        s = row.getText()
        data.append(s)
    salesRankElem = soup.find(attrs = {'id':'SalesRank'}).find(attrs = {'class' : 'value'}).getText()
    salesRank =  re.findall(r'\d+\sin', salesRankElem)[0].replace('in','').strip()
    return int(salesRank)

data = BeautifulSoup.extract(soup)
Example #31
for serie in series[3:]:
    for episode in range(serie[1],serie[2]):
        if len(str(episode)) < serie[3]: # Enterprise naming nonsense. God, the inconsistency
            episode = "0" * (serie[3] - len(str(episode))) + str(episode)

        url = "http://chakoteya.net/" + serie[0] + str(episode) + ".htm" #49.htm
        response = requests.post(url, data="{Test data}")
        #print(response)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, "html.parser")
            #print(soup)
            #print(soup.find_all('a'))
            time.sleep(.01)

            directory = "./" + serie[0]
            if not os.path.exists(directory):
                os.makedirs(directory)

            file = open("./" + serie[0] + str(episode) + ".txt", 'w+')

            lines = soup.extract().text

            #lines = response.text.replace("\n", " ").replace("\r", " ").replace("  ", " ")
            #print(lines)
            #lines = re.findall(": (.*?)" + re.escape("<br>"), lines)
            #print(lines[:60])#
            file.write("".join(lines[75:-260]))
            file.close()
        else:
            print("Missed: ", url)
    print("Through", serie[0])
Example #32
def getText(link, keywords, haveKeyword):
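    # Fetch a page, filter out link-heavy and repetitive blocks, and append sentences containing any keyword to haveKeyword.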
    print("---", link)
    try:
        r = requests.get(link, verify=False, timeout=5)
    except:
        return []
    b = BeautifulSoup(r.text, "lxml")
    [b.extract() for b in b(['script', 'link', 'meta', 'style'])]
    # assume the text will only be in p and div
    text = [b.extract() for b in b(['p', 'div'])]

    for excerpt in text:  # iterate through each para/div
        # Change newline tags to sentences
        found = excerpt.get_text(separator=" ").strip().replace(
            "\xa0", ". ").replace("\n",
                                  ". ").replace("\t",
                                                ". ").replace("\r",
                                                              ". ").strip()
        while "  " in found:  # Remove double spaces
            found = found.replace("  ", " ")
        if found.count("/") > 2:  # likely a link
            continue

        words = found.split(" ")
        if len(words) > 15:
            # Time to add it in
            # Check if it has the name of the product
            foundCleaned = [word for word in words if word not in stops]
            uniqueWords = set(foundCleaned)
            maxCount = 0
            for word in uniqueWords:
                maxCount = max(
                    maxCount, found.count(word)
                )  # count the number of unique words (excluding stopwords)
            if maxCount / len(foundCleaned) > 0.15:
                # ignored.append(found)
                continue
                #pass
            elif len(foundCleaned) / len(words) > 0.73:
                # Too many unique words
                continue
            else:
                haveKeywords = any(keyword in found.lower()
                                   for keyword in keywords)
                sentences = re.split(r'\. |! |; ', found)  #found.split(". ")
                for sentence in sentences:

                    # Remove parentheses
                    while "(" in sentence and ")" in sentence:
                        startBracket = sentence.find("(")
                        endBracket = sentence.find(")")
                        if startBracket > -1 and endBracket > startBracket:
                            sentence = sentence[0:startBracket].strip(
                            ) + " " + sentence[endBracket + 1:].strip()
                        else:
                            break
                        sentence = sentence.strip()

                    # Remove ending punctuation
                    while len(sentence) > 0 and (
                            sentence[len(sentence) - 1] == "."
                            or sentence[len(sentence) - 1] == ","):
                        sentence = sentence[0:len(sentence) - 1]

                    if len(sentence) == 0:
                        continue

                    words = sentence.split(" ")

                    if len(words) > 2:
                        if haveKeywords:
                            if sentence not in haveKeyword:
                                sentenceToAdd = ""
                                for word in words:
                                    if len(word) > 0:
                                        sentenceToAdd += word + " "
                                haveKeyword.append(sentenceToAdd.strip())
html_doc = """
<html>
    <head>
    <title>Titre de votre site</title>
    </head>
    <body>
        <p class="c1 c2">texte a lire1</p>
        <p class="c3">texte a lire2</p>
    </body>
</html>
"""

soup = BeautifulSoup(html_doc)

print soup.extract()

for p in soup.find_all('p'):
    print p.get("class")

print "---------------"

for p in soup.find_all('p'):
    print p
    p.string = "NV T"
    print p

print soup

print "-----------------------"