# -*- coding: utf-8 -*-
import re
import urllib
from decimal import Decimal

import MySQLdb as mdb
import requests
from BeautifulSoup import BeautifulSoup as Soup  # BeautifulSoup 3: .attrs is a list of (name, value) tuples
from soupselect import select

# time2sec(), insert(), white_list and the Django models (Certificate,
# Curriculum, LVATree, LVA) are defined elsewhere in the project.


def craw_out_time(i, j, url):
    """Scrape one schedule page and store its showtimes.

    i   -- Yahoo movie id
    j   -- area id
    url -- schedule page to scrape
    """
    # Connect to the server database.
    con = mdb.connect('localhost', 'root', '1234', 'hackday', charset='utf8')
    cur = con.cursor()

    target = urllib.urlopen(url)
    soup = Soup(target)
    rows = select(soup, '.bd-container div.row')
    for segment in rows:
        store = select(segment, "a")[0].string
        if store in white_list:
            print store
            # Note: this selects from the whole page rather than from
            # `segment`, so every whitelisted theater in the loop is paired
            # with all showtimes on the page.
            times = select(soup, '.mtcontainer span')
            for sec in times:
                second = time2sec(sec.string)
                cur.execute(insert('movie_time', {
                    "movie_id": i,      # Yahoo movie id
                    "theater": store,
                    "area": j,          # area id
                    "seconds": second,  # showtime in seconds
                }))
    con.commit()
def get_cross_data():
    """Collect the movieId and areaId <option> values from the Yahoo search form."""
    target = urllib.urlopen(
        "http://tw.m.yahoo.com/w/twmovie/schedulesearch_bp.php"
        "?movieId=4883&areaId=6&__submit=%E6%9F%A5%E8%A9%A2")
    soup = Soup(target)

    def option_values(selector):
        values = []
        for option in select(soup, selector):
            try:
                for name, value in option.attrs:
                    if str(name) == 'value':
                        values.append(int(value))
            except ValueError:
                # Skip options whose value is not numeric.
                pass
        return values

    film_list = option_values("select[name=movieId] option")
    area_list = option_values("select[name=areaId] option")

    # The scraped lists are then overridden with hand-picked ids.
    film_list = [
        4883, 4954, 4951, 4851, 4948, 4731, 4924, 4913, 4925, 4908, 4886,
        4946, 4912, 4936, 4901, 4903, 4915, 4850, 4940, 4941, 4919, 4904,
        4899, 4833, 4878, 4875, 4911, 4807, 4914, 4768, 4891, 4837, 4887,
        4868, 4839, 4848, 4803, 4828, 4869, 4855, 4844, 4863, 4865, 4861,
        4822, 4818, 4808, 4767, 4753, 4812, 4700, 4778, 4733, 4584, 4687,
        4765, 4769, 4789, 4723, 4526, 4772, 4648, 4587, 4796, 4223, 4694,
        4365, 4521, 4470, 3941, 3026, 2924,
    ]
    #area_list = [18, 16, 20, 22, 19, 13, 21, 10, 17, 11, 12, 14, 23]
    area_list = [20, 6, 2, 10]
    return film_list, area_list
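# A minimal driver sketch, an assumption rather than part of the original
# source: it feeds each (movie, area) pair from get_cross_data() into
# craw_out_time(), reusing the schedule URL template hard-coded above.
def crawl_all_schedules():
    film_list, area_list = get_cross_data()
    for i in film_list:
        for j in area_list:
            url = ("http://tw.m.yahoo.com/w/twmovie/schedulesearch_bp.php"
                   "?movieId=%d&areaId=%d&__submit=%%E6%%9F%%A5%%E8%%A9%%A2"
                   % (i, j))
            craw_out_time(i, j, url)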
def crawl(self):
    """Scrape every phrasal verb listed on each configured site."""
    self.all_phrasals = []
    for site in self.sites:
        html = requests.get(site).text
        soup = Soup(html)
        selected = select(soup, '.divbox a strong')
        phrasals = [s.contents[0] for s in selected]
        self.all_phrasals += phrasals
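# Hypothetical usage of crawl(); the class name and URL below are
# illustrative assumptions, not from the source:
#
#   crawler = PhrasalCrawler(sites=['http://example.com/phrasal-verbs'])
#   crawler.crawl()
#   print crawler.all_phrasals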
def craw_a_movie(url):
    """Scrape one Yahoo movie page into a dict of its basic fields."""
    fd = urllib.urlopen(url)
    soup = Soup(fd)
    data = {}
    raw = select(soup, '.dta')
    data['open'] = raw[0].string       # release date
    data['class'] = raw[1].string      # classification/rating
    data['long'] = raw[2].string       # running time
    data['director'] = raw[3].string
    data['cast'] = raw[4].string
    # .attrs is a list of (name, value) tuples in BeautifulSoup 3.
    for name, value in select(soup, '.border img')[0].attrs:
        if name == 'src':
            data['cover'] = str(value)
            break
    raw = select(soup, '#ymvs .bd .full p')
    data['description'] = re.sub("<.*?>", "", str(raw))  # strip HTML tags
    return data
def create(cls, user, **data):
    """Create a Certificate after verifying the course name against TISS."""
    # Keep only the alphanumeric part of the course number.
    data['lvano'] = ''.join([x for x in data['lvano'] if x.isalnum()])
    url = ('https://tiss.tuwien.ac.at/course/courseDetails.xhtml?courseNr='
           + data['lvano'])
    url_de = url + '&locale=de'
    url_en = url + '&locale=en'
    soup_de = Soup(urllib.urlopen(url_de))
    soup_en = Soup(urllib.urlopen(url_en))

    # The course name is the first non-empty <h1>, minus its leading <span>.
    remove_spans = lambda x: x.text.replace(select(x, 'span')[0].text, '').strip()
    get_name = lambda s: remove_spans([x for x in select(s, 'h1') if x.text][0])
    data['lvaname_de'] = get_name(soup_de)
    data['lvaname_en'] = get_name(soup_en)
    data['lvaname_import'] = data['lvaname'].strip()
    if data['lvaname'] not in (data['lvaname_de'], data['lvaname_en'], ):
        print data['lvaname'], 'is not the same as', data['lvaname_de'], data['lvaname_en']
        return
    del data['lvaname']

    try:
        x = Certificate.objects.get(lvano=data['lvano'], user=user)
        print x, 'cert already exists'
    except Certificate.DoesNotExist:
        return Certificate.objects.create(user=user, **data)
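# Hypothetical call; the field values are illustrative, and in the real model
# create() is presumably wrapped with @classmethod:
#
#   cert = Certificate.create(request.user,
#                             lvano='123.456',
#                             lvaname='Some Course Name')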
def parse_from_curriculum_url(cls, url):
    """Import a TISS curriculum page into Curriculum/LVATree/LVA records."""
    soup = Soup(urllib.urlopen(url))
    levelindex = 0
    previous_levelindex = -1
    curriculum = None
    stack = []  # current path of LVATree nodes, one per nesting level

    # Crude error-page check.
    if 'errorPage' in soup:
        print 'error page!'
        return

    # The first non-empty <h1> holds "<number> <number> <name>".
    titles = [x.text for x in select(soup, 'h1') if x.text]
    currno = ''.join(titles[0].strip().split(' ', 2)[:2])
    name = ''.join(titles[0].strip().split(' ', 2)[2:])

    lang = dict(select(soup, 'body')[0].attrs)['class'].replace('lehre', '').strip()
    if lang != 'de':
        print 'import not german'
        return

    for row in select(soup, '#nodeTable tbody tr'):
        data = select(row, 'td')
        title = data[0]
        (smst, ects) = data[-2:]
        ects = ects.text
        if ects:
            ects = Decimal(ects)

        # Nesting depth is encoded in the "nodeTable-level-N" class.
        titlelevel = dict(select(title, 'div')[0].attrs)['class'].replace('nodeTable-level-', '')
        levelindex = int(titlelevel[0], 10)

        # Pop the stack when the table moves back up to a shallower level.
        if previous_levelindex > levelindex and stack and levelindex < 3:
            number = previous_levelindex if previous_levelindex < 4 else 3
            for _ in range(levelindex, number):
                stack.pop()

        if levelindex == 0:
            curriculum, created = Curriculum.objects.get_or_create(name=name, number=currno)
            if not created:
                curriculum.trees.all().delete()

        if levelindex in (1, 2):
            name = select(title, 'span')[0].text
            type_ = title.text[:-len(name)]
            if levelindex == 1 and title.text.startswith('Curriculum Suppleme'):
                print type_
                break
            lvatree = LVATree.objects.create(treetype=type_, name=name,
                                             min_ects=ects if ects else 0)
            if stack:
                stack[-1].subtrees.add(lvatree)
            else:
                curriculum.trees.add(lvatree)
            stack.append(lvatree)

        if levelindex == 3:
            courseTitle = select(title, 'span')[0].text.strip()
            courseType = title.text.strip()[:2]
            if not ects:
                print 'ignored: ', courseTitle
                continue
            lva = LVA.objects.create(lvatype=courseType, ects=ects, name_de=courseTitle)
            stack[-1].lvas.add(lva)

        # Level-4 rows are ignored.
        previous_levelindex = levelindex

    return curriculum
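# Hypothetical invocation; the host class is not shown in the source, but the
# method builds Curriculum objects, so it plausibly lives on that model:
#
#   curriculum = Curriculum.parse_from_curriculum_url(some_tiss_curriculum_url)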
def movie_link_parser(file_name, links):
    """Map each movie title on a Yahoo listing page to its detail-page href."""
    file_ob = urllib.urlopen(file_name)
    soup = Soup(file_ob)
    for out in select(soup, '.bd.vlist h4 a'):
        links[out.string.encode("utf-8")] = out['href']
def craw_a_movie(url):
    """Minimal variant of craw_a_movie: returns only the page's <h1> tags."""
    fd = urllib.urlopen(url)
    soup = Soup(fd)
    data = {}
    data["location"] = select(soup, 'h1')
    return data
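# A small usage sketch (an assumption, not part of the original source):
# movie_link_parser() fills a dict of title -> detail-page href, and
# craw_a_movie() then scrapes each detail page.
def crawl_movie_pages(listing_url):
    links = {}
    movie_link_parser(listing_url, links)
    return dict((title, craw_a_movie(href)) for title, href in links.items())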