Example no. 1
import re
import urllib

from bs4 import BeautifulSoup  # assumed import; BeautifulSoup 3 exposes the same findAll API


def extract_pages_and_next_link(content):
    # Parse a MediaWiki listing page: collect article titles and find the "next page" link.
    html = BeautifulSoup(content)
    pages = html.find('div', {'id': 'mw-pages'})
    links = pages.findAll('a')
    titles = []
    last_link = ''
    for link in links:
        text = link.getText()
        title = link.get('title')
        href = link.get('href')
        if not href:
            continue  # skip anchors without an href
        m = re.match(r'^/wiki/(.*)$', href)
        if m:
            w = m.group(1)
            wu = urllib.unquote(w)  # percent-decoded page name, kept for debugging
            if text != title:
                raise Exception('Mistake in link: %s' % unicode(link))
            titles.append(text)
        last_link = href
    m = re.match(r'^/w/index\.php\?title=(.*)$', last_link)
    if not m:
        raise Exception('Cannot find "next" link')
    return last_link, '\n'.join(titles)
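
A minimal usage sketch (the category URL is hypothetical; assumes the imports above). Note the function raises if the page's last link is not a "next page" link:

content = urllib.urlopen('https://en.wiktionary.org/wiki/Category:English_nouns').read()
next_link, titles = extract_pages_and_next_link(content)
print next_link
print titles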
Example no. 2
from os.path import join

from bs4 import BeautifulSoup  # assumed import


class Parser:
    def __init__(self, path_in, path_out, name, file_in_dir=None, slug=None):
        self.path_in = path_in
        self.path_out = path_out
        self.name = name
        self.file_in_dir = file_in_dir
        self.dub = slug in ['dmitriev', 'efremova', 'dic_fwords',
                            'dic_synonims']
        self.html = ''
        if file_in_dir:
            self.filename = join(self.path_in, self.name, self.file_in_dir)
        else:
            self.filename = join(self.path_in, self.name)

        self.out_filename = join(self.path_out, self.name)

    def read_file(self):
        content = open(self.filename).read()
        self.html = BeautifulSoup(content)

    def parse_file(self):
        self.h1 = self.dt = self.dd = ''
        try:
            self.h1 = self.html.find('h1').getText()
        except AttributeError, e:
            print '#', self.name, "#", "'h1' not found:", e
            return False
        # try:
        #     dl = self.html.find('dl')
        # except AttributeError, e:
        #     print '#', self.name, "#", "'dl' not found:", e
        #     return False
        # self.dt = dl.find('dt').getText()
        # self.dd = unicode(dl.find('dd'))
        try:
            self.dt = self.html.find('dt').getText()
        except AttributeError, e:
            print '##', self.name, "-", "DT not found:", e
            return False
        return True  # both 'h1' and 'dt' were found
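
A minimal usage sketch (the paths, file name, and slug value are hypothetical; 'efremova' is one of the slugs the class itself checks for):

parser = Parser('/data/in', '/data/out', 'word.html', slug='efremova')
parser.read_file()
if parser.parse_file():
    print parser.h1
    print parser.dt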
Example no. 3
    version = "Opera/9.80 (Windows NT 6.2; WOW64) Presto/2.12.388 Version/12.16"

urllib._urlopener = AppURLopener()


files = os.listdir(path)  # 'path' (like 'edit' below) is defined earlier in the original script

start = 0
threads = 1

# Alternative traversal orders kept from the original:
# for filename in files[len(files) / 2 - 1:0:-1]:
# for filename in files[len(files) / 2:]:
# for filename in files[len(files)-1:0:-1]:
for filename in files[start::threads]:  # take every 'threads'-th file, starting at 'start'
    content = open(join(path, filename)).read()
    html = BeautifulSoup(content)
    table = html.find('table', {'class': 'mw-allpages-table-chunk'})
    if not table:
        print '# Failed', filename
        continue
    links = table.findAll('a')
    for link in links:
        word = link.getText()
        cls = link.get('class')
        url = link.get('href')
        if edit:
            url += '?action=edit'
        content2 = ''
        ok = False
        dt = datetime.now().strftime("[%H:%M:%S]")
        dt = "(%s) %s" % (start, dt)
Example no. 4
from bs4 import BeautifulSoup  # assumed import


def extract_wiki_text(content):
    # Pull the raw wikitext out of a MediaWiki edit page's <textarea>.
    html = BeautifulSoup(content)
    return html.find('textarea').getText()
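
A minimal usage sketch (the saved file name is hypothetical); this pairs with the '?action=edit' URLs built in Example no. 3, whose responses carry the wikitext in a <textarea>:

with open('page_edit.html') as f:
    print extract_wiki_text(f.read())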