Example #1
    def get_url(self, url, save=None, body=None, timeout=1):
        try:
            if body is None:
                method = "get"
            else:
                method = "post"
            if save is None:
                save = save_files
#            if self.islive():
            headers = {
                'Host': '192.168.0.10',
                'Connection': 'Keep-Alive',
                'User-Agent': 'OI.Share v2',
                'Content-Type': 'text/xml',
            }

            # Note: this XML payload is assembled here but never attached to the request below.
            page = etree.Element('set')
            page_element = etree.SubElement(page, 'value')
            page_element.text = '2'
            xml_data = etree.tostring(page, pretty_print=True,
                                      encoding=None).decode()
            # Use the method chosen above so that a body triggers a POST instead of a GET.
            response = requests_retry_session().request(method, url,
                                                         headers=headers,
                                                         data=body,
                                                         timeout=timeout)
            txt = response.text
            return self.remove_encoding(txt)
        except requests.RequestException as e:
            raise OMDNotThere(
                "Could not connect to the OM-D camera at 192.168.0.10. This is probably because you're not connected to the camera's wifi network."
            ) from e
Example #2
 def parse_articles(self,
                    start,
                    end,
                    board,
                    outname='.',
                    count=0,
                    date='/',
                    path='.',
                    timeout=3):
     filename = outname
     if filename == '.':
         filename = board + '-' + str(start) + '-' + str(end) + '-' + str(
             count) + '.json'
     filename = os.path.join(path, filename)
     self.store(filename, u'{"articles": [', 'w')
     first = True
     for i in range(end - start + 1):
         index = start + i
         print('Processing index:', str(index))
         resp = requests_retry_session().get(url=self.PTT_URL + '/bbs/' +
                                             board + '/index' + str(index) +
                                             '.html',
                                             cookies={'over18': '1'},
                                             verify=VERIFY,
                                             timeout=timeout)
         if resp.status_code != 200:
             print('invalid url:', resp.url)
             continue
         soup = BeautifulSoup(resp.text, 'html.parser')
         divs = soup.find_all("div", "r-ent")
         for div in divs:
             c = div.find('div', class_='nrec')
             if not self.filter_by_count(count, c.get_text()):
                 continue
             c = div.find('div', class_='meta').find('div', class_='date')
             if date not in c.get_text():
                 continue
             try:
                 # ex. link would be <a href="/bbs/PublicServan/M.1127742013.A.240.html">Re: [問題] 職等</a>
                 href = div.find('a')['href']
                 link = self.PTT_URL + href
                 article_id = re.sub(r'\.html', '', href.split('/')[-1])
                 parsed = self.parse(link, article_id, board)
                 if not first:
                     self.store(filename, ',\n', 'a')
                 else:
                     first = False
                 self.store(filename, parsed, 'a')
             except Exception:
                 # Skip articles that fail to download or parse.
                 pass
         time.sleep(0.1)
     self.store(filename, u']}', 'a')
     return filename
Example #3
 def getLastPage(board, timeout=3):
     content = requests_retry_session().get(
         url='https://www.ptt.cc/bbs/' + board + '/index.html',
         cookies={
             'over18': '1'
         },
         timeout=timeout).content.decode('utf-8')
     first_page = re.search(
         r'href="/bbs/' + board + r'/index(\d+)\.html">&lsaquo;', content)
     if first_page is None:
         return 1
     return int(first_page.group(1)) + 1
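
For context, a hedged sketch of how getLastPage could be combined with parse_articles from Example #2. The crawler object and the board name 'PublicServan' (taken from the comment in Example #2) are placeholders, not part of the examples above.

# Sketch only: crawl the five most recent index pages of a board.
last = getLastPage('PublicServan', timeout=3)
output = crawler.parse_articles(start=last - 4, end=last, board='PublicServan')
print('articles written to', output)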
Example #4
 def islive(self):
     try:
         requests_retry_session().get("http://" + self.ip + "/", timeout=1)
         return True
     except requests.RequestException:
         return False
Example #5
import requests
import sys
import time

sys.path.append("C:/Users/Tag Livros/python-libs/functions")

from requests_retry_session import requests_retry_session  # this import must come after the sys.path.append call above

# Test 1
print("Test 1")
response = requests_retry_session().get('https://www.peterbe.com/')
print(response.status_code)

s = requests.Session()
s.auth = ('user', 'pass')
s.headers.update({'x-test': 'true'})

response = requests_retry_session(session=s).get('https://www.peterbe.com')

# Test 2
print("\nTest 2")
t0 = time.time()
try:
    response = requests_retry_session().get('http://localhost:9999')
except Exception as x:
    print('It failed :(', x.__class__.__name__)
else:
    print('It eventually worked', response.status_code)
finally:
    t1 = time.time()
    print('Took', t1 - t0, 'seconds')
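
None of the examples above define requests_retry_session itself (Example #5 imports it from a local module). Below is a minimal sketch of such a helper, following the common Retry/HTTPAdapter pattern; the retry count, backoff factor, and status list are assumed defaults, not taken from the examples.

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry


def requests_retry_session(retries=3, backoff_factor=0.3,
                           status_forcelist=(500, 502, 504), session=None):
    # Reuse a caller-supplied session (as in Test 1 above) or create a new one.
    session = session or requests.Session()
    retry = Retry(total=retries, read=retries, connect=retries,
                  backoff_factor=backoff_factor,
                  status_forcelist=status_forcelist)
    adapter = HTTPAdapter(max_retries=retry)
    # Mount the retrying adapter for both plain and TLS endpoints.
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session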
Example #6
    def parse(link, article_id, board, timeout=3):
        print('Processing article:', article_id)
        resp = requests_retry_session().get(url=link,
                                            cookies={'over18': '1'},
                                            verify=VERIFY,
                                            timeout=timeout)
        if resp.status_code != 200:
            print('invalid url:', resp.url)
            return json.dumps({"error": "invalid url"},
                              sort_keys=True,
                              ensure_ascii=False)
        soup = BeautifulSoup(resp.text, 'html.parser')
        main_content = soup.find(id="main-content")
        metas = main_content.select('div.article-metaline')
        author = ''
        title = ''
        date = ''
        if metas:
            author = metas[0].select(
                'span.article-meta-value')[0].string if metas[0].select(
                    'span.article-meta-value')[0] else author
            title = metas[1].select(
                'span.article-meta-value')[0].string if metas[1].select(
                    'span.article-meta-value')[0] else title
            date = metas[2].select(
                'span.article-meta-value')[0].string if metas[2].select(
                    'span.article-meta-value')[0] else date

            # remove meta nodes
            for meta in metas:
                meta.extract()
            for meta in main_content.select('div.article-metaline-right'):
                meta.extract()

        # remove and keep push nodes
        pushes = main_content.find_all('div', class_='push')
        for push in pushes:
            push.extract()

        try:
            ip = main_content.find(text=re.compile(u'※ 發信站:'))
            ip = re.search(r'[0-9]*\.[0-9]*\.[0-9]*\.[0-9]*', ip).group()
        except Exception:
            ip = "None"

        # Remove lines starting with '※ 發信站:' (u'\u203b') or '◆ From:' (u'\u25c6'), blank lines, and extra whitespace
        # Keep alphanumerics, Chinese characters and punctuation, URLs, and a few special symbols
        filtered = [
            v for v in main_content.stripped_strings
            if v[0] not in [u'※', u'◆'] and v[:2] not in [u'--']
        ]
        expr = re.compile(
            u(r'[^\u4e00-\u9fa5\u3002\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b\s\w:/-_.?~%()]'
              ))
        for i in range(len(filtered)):
            filtered[i] = re.sub(expr, '', filtered[i])

        filtered = [_f for _f in filtered if _f]  # remove empty strings
        filtered = [x for x in filtered if article_id not in x
                    ]  # remove last line containing the url of the article
        content = ' '.join(filtered)
        content = re.sub(r'(\s)+', ' ', content)
        # print 'content', content

        # push messages
        p, b, n = 0, 0, 0
        messages = []
        for push in pushes:
            if not push.find('span', 'push-tag'):
                continue
            push_tag = push.find('span', 'push-tag').string.strip(' \t\n\r')
            push_userid = push.find('span',
                                    'push-userid').string.strip(' \t\n\r')
            # if find is None: find().strings -> list -> ' '.join; else the current way
            push_content = push.find('span', 'push-content').strings
            push_content = ' '.join(push_content)[1:].strip(
                ' \t\n\r')  # remove ':'
            push_ipdatetime = push.find(
                'span', 'push-ipdatetime').string.strip(' \t\n\r')
            messages.append({
                'push_tag': push_tag,
                'push_userid': push_userid,
                'push_content': push_content,
                'push_ipdatetime': push_ipdatetime
            })
            if push_tag == u'推':
                p += 1
            elif push_tag == u'噓':
                b += 1
            else:
                n += 1

        # count: pushes minus boos; all: total number of push messages
        message_count = {
            'all': p + b + n,
            'count': p - b,
            'push': p,
            'boo': b,
            "neutral": n
        }

        # print 'msgs', messages
        # print 'mscounts', message_count

        # json data
        data = {
            'url': link,
            'board': board,
            'article_id': article_id,
            'article_title': title,
            'author': author,
            'date': date,
            'content': content,
            'ip': ip,
            'message_count': message_count,
            'messages': messages
        }
        # print 'original:', d
        return json.dumps(data, sort_keys=True, ensure_ascii=False)
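
Since parse returns a JSON string, a caller would typically decode it again before use. A brief sketch follows; the URL and article id are taken from the comment in Example #2 and are used only as placeholders.

# Sketch only: decode the JSON string returned by parse().
link = 'https://www.ptt.cc/bbs/PublicServan/M.1127742013.A.240.html'
raw = parse(link, 'M.1127742013.A.240', 'PublicServan')
article = json.loads(raw)
print(article['article_title'], article['message_count']['count'])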