Example #1
 def test_parse_without_metalines(self):
     self.link = 'https://www.ptt.cc/bbs/NBA/M.1432438578.A.4B0.html'
     self.article_id = 'M.1432438578.A.4B0'
     self.board = 'NBA'
     jsondata = json.loads(crawler.parse(self.link, self.article_id, self.board))
     self.assertEqual(jsondata['article_id'], self.article_id)
     self.assertEqual(jsondata['board'], self.board)
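These test methods come from a PTT web crawler's test suite; a minimal scaffold they could run inside is sketched below. The class name and the import path of the crawler module are assumptions, not details taken from the original project.

import json  # used by the test bodies via json.loads
import unittest

import crawler  # assumed import path for the module that provides parse()


class ParseTestCase(unittest.TestCase):
    # the test methods shown in these examples (test_parse, test_parse_without_metalines, ...) go here
    pass


if __name__ == '__main__':
    unittest.main()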
Example #2
 def test_parse_with_push_without_contents(self):
     self.link = 'https://www.ptt.cc/bbs/Gossiping/M.1433091897.A.1C5.html'
     self.article_id = 'M.1433091897.A.1C5'
     self.board = 'Gossiping'
     jsondata = json.loads(crawler.parse(self.link, self.article_id, self.board))
     self.assertEqual(jsondata['article_id'], self.article_id)
     self.assertEqual(jsondata['board'], self.board)
Example #3
def home(request):
    if request.method == 'GET':
        return render(
            request,
            'demo/demo.html',
        )
    elif request.method == 'POST' and request.is_ajax():
        bname = escape(request.POST.get('board_name'))
        aid = escape(request.POST.get('article_id'))
        link = PTT_URL + '/bbs/' + bname + '/' + aid + '.html'
        if not (bname and aid):
            return HttpResponse(json.dumps({'data': {'error': 'invalid url'}, 'link': link}), content_type='application/json')
        if aid.lower() == 'latest' or aid.lower() == 'index':
            resp = requests.get(
                url=PTT_URL + '/bbs/' + bname + '/index.html',
                cookies={'over18': '1'}, verify=False
            )
            if resp.status_code == 200:
                soup = BeautifulSoup(resp.text, 'html.parser')  # name a parser explicitly to avoid the bs4 warning
                divs = soup.find_all("div", "r-ent")
                aid = divs[-1].select("div.title > a")[0]['href'].split("/")[3].replace(".html", "")
        link = PTT_URL + '/bbs/' + bname + '/' + aid + '.html'
        data = json.loads(
            crawler.parse(link, aid, bname)
        )
        return HttpResponse(json.dumps({'data': data, 'link': link}), content_type='application/json')
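The view above relies on a handful of imports and a PTT_URL constant that the snippet does not show. A plausible module header is sketched here; PTT_URL is inferred from the article links used elsewhere on this page, and the crawler import path is an assumption.

import json

import requests
from bs4 import BeautifulSoup
from django.http import HttpResponse
from django.shortcuts import render
from django.utils.html import escape

import crawler  # assumed: the module that exposes parse(link, aid, bname)

PTT_URL = 'https://www.ptt.cc'  # inferred from the test URLs above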
Example #4
def home(request):
    if request.method == 'GET':
        return render(
            request,
            'demo/demo.html',
        )
    elif request.method == 'POST' and request.is_ajax():
        bname = escape(request.POST.get('board_name'))
        aid = escape(request.POST.get('article_id'))
        link = PTT_URL + '/bbs/' + bname + '/' + aid + '.html'
        if not (bname and aid):
            return HttpResponse(json.dumps({
                'data': {
                    'error': 'invalid url'
                },
                'link': link
            }),
                                content_type='application/json')
        if aid.lower() == 'latest' or aid.lower() == 'index':
            resp = requests.get(url=PTT_URL + '/bbs/' + bname + '/index.html',
                                cookies={'over18': '1'},
                                verify=False)
            if resp.status_code == 200:
                soup = BeautifulSoup(resp.text, 'html.parser')  # name a parser explicitly to avoid the bs4 warning
                divs = soup.find_all("div", "r-ent")
                aid = divs[-1].select("div.title > a")[0]['href'].split(
                    "/")[3].replace(".html", "")
        link = PTT_URL + '/bbs/' + bname + '/' + aid + '.html'
        data = json.loads(crawler.parse(link, aid, bname))
        return HttpResponse(json.dumps({
            'data': data,
            'link': link
        }),
                            content_type='application/json')
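For completeness, wiring this home view into a URLconf might look like the hypothetical demo/urls.py below (module and route names are assumptions).

from django.urls import path

from . import views

urlpatterns = [
    path('', views.home, name='home'),
]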
Example #5
    def parse(self, response):
        items = AutoNewsItem()
        items['collect_log_objects'] = []
        items['parsed_news_objects'] = []
        # assumes the endpoint returns JSON; json.loads() avoids eval() on untrusted text
        data = json.loads(response.text)['list']

        if not len(data):
            raise CloseSpider('close it')

        for news in data:
            # rebuild the article URL: swap the '/money...' path of the listing
            # URL for the article's own HyperLink
            url = re.sub(r'\/money.*', f'{news["HyperLink"]}?chdtv',
                         response.url)
            html = requests.get(url).text
            crawler_time = datetime.now()
            date, title, article, keywords = crawler.parse(url)
            items['collect_log_objects'].append(
                CollectLog(poster='scrapy',
                           url=url,
                           html=html,
                           collect_time=crawler_time))
            items['parsed_news_objects'].append(
                ParsedNews(url=url,
                           title=title,
                           article=article,
                           keywords=keywords,
                           date=date))
        yield items
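The AutoNewsItem populated above needs two fields; a minimal item definition consistent with this parse() method is sketched below (an assumption, the project's actual items.py may differ).

import scrapy


class AutoNewsItem(scrapy.Item):
    # containers for the ORM objects built while parsing each news entry
    collect_log_objects = scrapy.Field()
    parsed_news_objects = scrapy.Field()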
Example #6
 def test_parse(self):
     self.link = 'https://www.ptt.cc/bbs/PublicServan/M.1409529482.A.9D3.html'
     self.article_id = 'M.1409529482.A.9D3'
     self.board = 'PublicServan'
     jsondata = json.loads(crawler.parse(self.link, self.article_id, self.board))
     self.assertEqual(jsondata['article_id'], self.article_id)
     self.assertEqual(jsondata['board'], self.board)
     self.assertEqual(jsondata['message_conut']['count'], 55)
Example #7
 def test_parse_without_metalines(self):
     self.link = 'https://www.ptt.cc/bbs/NBA/M.1432438578.A.4B0.html'
     self.article_id = 'M.1432438578.A.4B0'
     self.board = 'NBA'
     jsondata = json.loads(
         crawler.parse(self.link, self.article_id, self.board))
     self.assertEqual(jsondata['article_id'], self.article_id)
     self.assertEqual(jsondata['board'], self.board)
Example #8
 def test_parse_with_push_without_contents(self):
     self.link = 'https://www.ptt.cc/bbs/Gossiping/M.1433091897.A.1C5.html'
     self.article_id = 'M.1433091897.A.1C5'
     self.board = 'Gossiping'
     jsondata = json.loads(
         crawler.parse(self.link, self.article_id, self.board))
     self.assertEqual(jsondata['article_id'], self.article_id)
     self.assertEqual(jsondata['board'], self.board)
Example #9
 def test_parse(self):
     self.link = 'https://www.ptt.cc/bbs/PublicServan/M.1409529482.A.9D3.html'
     self.article_id = 'M.1409529482.A.9D3'
     self.board = 'PublicServan'
     jsondata = json.loads(
         crawler.parse(self.link, self.article_id, self.board))
     self.assertEqual(jsondata['article_id'], self.article_id)
     self.assertEqual(jsondata['board'], self.board)
     self.assertEqual(jsondata['message_conut']['count'], 55)
Example #10
 def test_parse_with_structured_push_contents(self):
     self.link = 'https://www.ptt.cc/bbs/Gossiping/M.1119222660.A.94E.html'
     self.article_id = 'M.1119222660.A.94E'
     self.board = 'Gossiping'
     jsondata = json.loads(crawler.parse(self.link, self.article_id, self.board))
     self.assertEqual(jsondata['article_id'], self.article_id)
     self.assertEqual(jsondata['board'], self.board)
     isCatched = False
     for msg in jsondata['messages']:
         if u'http://tinyurl.com/4arw47s' in msg['push_content']:
             isCatched = True
     self.assertTrue(isCatched)
Example #11
def check_page():
    page = crawl(configuration['targetURL'])  # .decode("utf8")
    page_hash = md5(page)
    c = load()
    if not c['hash'] == page_hash:
        print("HASH CHANGED! (" + page_hash + ")")

        # Run a background thread to archive the page in the web archive
        start_new_thread(crawl, ("https://web.archive.org/save/" +
                                 configuration['targetURL'], False))

        # Check if the file is online and we haven't sent the mail already (if so, send it)
        match = parse(page.decode('utf8'))
        if match is not None and not c['mailSent']:
            print(
                "FILE IS ONLINE! Sending mails ... (and we didn't send them already)"
            )
            docx = crawl(match)
            for person_details in configuration['details']:
                variables = {
                    "name": person_details['name'],
                    "year": person_details['targetYear'],
                    "quarter": person_details['quarter'],
                    "mail": person_details['mail'],
                    "streetAndCity": person_details['streetAndCity'],
                    "phone": person_details['phone'],
                    "matrikelnr": person_details['matrikelnr']
                }
                res = parser.update_document_contents(docx, person_details)
                res_filename = f"Antrag Wohnheimzimmer {variables['quarter']} {variables['year']}.docx"
                mail.send(configuration['mail'], variables, res, res_filename)
            c['mailSent'] = True

        # Send a mail regardless of the above that there is a change
        notification_conf = {
            "body":
            "Something changed! Go and visit " + configuration['targetURL'],
            "subject":
            "IMPORTANT | The watched website has changed! Go check it immediately!",
            "recipient": configuration['mail']['notificationRecipient'],
            "server": configuration['mail']['server']
        }
        if c['mailSent']:
            notification_conf['body'] += "\n\n Oh and btw I already sent your reservation request ;)\n\n Have a good one!\n - AccommodationBot"
        mail.send(notification_conf)

        c['hash'] = page_hash
    else:
        print("Boring old same page...")

    save(c)
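check_page() calls an md5() helper that is not shown. One plausible definition, assuming crawl() returns the raw page as bytes, is:

import hashlib


def md5(data):
    # hash the raw page bytes so the stored value can be compared on the next run
    return hashlib.md5(data).hexdigest()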
Example #12
 def test_parse_with_structured_push_contents(self):
     self.link = 'https://www.ptt.cc/bbs/Gossiping/M.1119222660.A.94E.html'
     self.article_id = 'M.1119222660.A.94E'
     self.board = 'Gossiping'
     jsondata = json.loads(
         crawler.parse(self.link, self.article_id, self.board))
     self.assertEqual(jsondata['article_id'], self.article_id)
     self.assertEqual(jsondata['board'], self.board)
     isCatched = False
     for msg in jsondata['messages']:
         if u'http://tinyurl.com/4arw47s' in msg['push_content']:
             isCatched = True
     self.assertTrue(isCatched)
Example #13
async def collect(news: News):
    collect_log_objects = []
    parsed_news_objects = []
    try:
        for url in news.urls:
            html = requests.get(url).text
            date, title, article, keywords = crawler.parse(url)
            collect_time = datetime.now()
            collect_log_objects.append(
                CollectLog(poster=news.poster,
                           url=url,
                           html=html,
                           collect_time=collect_time))
            parsed_news_objects.append(
                ParsedNews(url=url,
                           title=title,
                           article=article,
                           keywords=keywords,
                           date=date))
        collect_news_to_db(collect_log_objects, parsed_news_objects)
        print(f'{news.urls} collected')
    except Exception as e:
        print(e)
    return {"message": "collected"}
Example #14
def test():
    assert_equals(('Family Guy', '11',  '6'), parse('Family.Guy.S11E06.rest.avi'))
    assert_equals(('Family Guy', '11', '60'), parse('Family.Guy.S11E60.rest.avi'))
    assert_equals(('Family Guy',  '1', '60'), parse('Family.Guy.S01E60.rest.avi'))
    assert_equals(('TheSeaEar' ,  '1', '60'), parse('TheSeaEar.S01E60.rest.avi'))
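One way to satisfy these assertions is a small regular-expression based parse(); the sketch below is only an illustration, not the code under test.

import re


def parse(filename):
    # split '<name>.S<season>E<episode>.<rest>' style file names
    match = re.match(r'(?P<name>.+)\.S(?P<season>\d+)E(?P<episode>\d+)\.', filename)
    name = match.group('name').replace('.', ' ')
    season = str(int(match.group('season')))    # drop leading zeros: '01' -> '1'
    episode = str(int(match.group('episode')))  # '06' -> '6', '60' stays '60'
    return name, season, episode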