Esempio n. 1
0
                    [trh.extract() for trh in trash]
            except IndexError:
                trash = body.findAll(tag)
                if trash:
                    [trh.extract() for trh in trash]
        comments = body.findAll(text=lambda text:isinstance(text, Comment))
        [comment.extract() for comment in comments]
        return body

    # get the settings
    def getSetings(self):
        """Normalize ``self.url`` and pick the matching entry of ``SITES``.

        If the URL does not start with ``http://`` or ``https://``, the
        ``http://`` scheme is prepended and stored back on ``self.url``.
        The host part (with a leading ``www.`` dropped) is then used as the
        key into the module-level ``SITES`` mapping (defined outside this
        method); when there is no entry for that host, ``SITES['default']``
        is used instead.  The chosen value is stored in ``self.settings``.
        """
        # Test the prefix instead of substring membership: an https:// URL
        # must not get a second scheme prepended, and an 'http://' that only
        # occurs later in the URL (e.g. in a query string) is not a scheme.
        if not self.url.startswith(('http://', 'https://')):
            self.url = 'http://' + self.url
        # The URL now has the form '<scheme>://<host>[/...]', so the third
        # '/'-separated piece is the host.
        host = self.url.split('/')[2]
        # Drop only a leading 'www.'; removing it anywhere would corrupt
        # hosts that merely contain that substring.
        if host.startswith('www.'):
            host = host[len('www.'):]
        try:
            self.settings = SITES[host]
        except KeyError:
            self.settings = SITES['default']

    # split id/class:name on the colon
    def indent(self, txt):
        """Split *txt* on colons and return the list of pieces.

        For example ``'id:content'`` becomes ``['id', 'content']``; a string
        without a colon comes back as a single-element list.
        """
        pieces = txt.split(':')
        return pieces

# Smoke test: parse and save every URL listed in testLink.txt
# (original note: 4 sites, 10 URLs from each).
if __name__ == '__main__':
    from saver import Saver
    # Use a context manager so the link file is closed even when parsing or
    # saving one of the URLs raises.
    with open('testLink.txt') as links:
        for line in links:
            line = line.rstrip()
            # A blank line (e.g. a trailing newline at the end of the file)
            # carries no URL, so skip it instead of parsing an empty string.
            if not line:
                continue
            test = Parser(line)
            f = Saver(line)
            f.saveFile(test.result())
Esempio n. 2
0
# -*- coding: utf-8 -*-
from parser import Parser
from saver import Saver
import sys

# The URL to process is the first command-line argument.
try:
    url = sys.argv[1]
except IndexError:
    # Passing a string to sys.exit() writes it to stderr and exits with
    # status 1, so a missing argument is reported as a failure rather than
    # as a successful run (a bare sys.exit() exits with status 0).
    sys.exit('Не передан URL')

# Run the parser on the URL and pass its result to Saver.saveFile.
obj = Parser(url)
text = obj.result()

f = Saver(url)
f.saveFile(text)