Example #1
def craw_out_time(i, j, url):
    """Scrape the showtimes on one schedule page and store them in MySQL."""
    # Assumed imports, shared by the examples on this page:
    #   import urllib
    #   import MySQLdb as mdb
    #   from BeautifulSoup import BeautifulSoup as Soup
    #   from soupselect import select
    # `insert`, `time2sec` and `white_list` are project helpers defined elsewhere.
    con = mdb.connect('localhost', 'root', '1234', 'hackday', charset='utf8')
    cur = con.cursor()

    target = urllib.urlopen(url)
    soup = Soup(target)
    rows = select(soup, '.bd-container div.row')
    for segment in rows:
        store = select(segment, "a")[0].string
        if store in white_list:
            print store
            # Select the showtimes inside the current theater block; the
            # original selected from `soup`, which repeats the whole page's
            # times for every matching theater.
            times = select(segment, '.mtcontainer span')
            for sec in times:
                # i: the movie's Yahoo! id, j: the area id,
                # second: the showtime converted to seconds.
                second = time2sec(sec.string)
                cur.execute(insert('movie_time', {
                    "movie_id": i,
                    "theater": store,
                    "area": j,
                    "seconds": second,
                }))
            con.commit()
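
The `insert` helper used above is not shown anywhere on this page. A minimal hypothetical reconstruction, purely to make the example self-contained; a real implementation should prefer parameterized queries over string building:

def insert(table, fields):
    # Hypothetical sketch of the missing insert() helper: build an
    # INSERT statement from a column -> value dict, escaping values with
    # MySQLdb. Parameterized queries would be safer than interpolation.
    cols = ', '.join(fields.keys())
    vals = ', '.join("'%s'" % mdb.escape_string(str(v)) for v in fields.values())
    return "INSERT INTO %s (%s) VALUES (%s)" % (table, cols, vals)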
Example #2
def get_cross_data():
    """Collect the movie and area ids offered by the schedule search form."""
    film_list = []
    area_list = []
    target = urllib.urlopen(
        "http://tw.m.yahoo.com/w/twmovie/schedulesearch_bp.php?movieId=4883&areaId=6&__submit=%E6%9F%A5%E8%A9%A2"
    )
    soup = Soup(target)
    # In BeautifulSoup 3, tag.attrs is a list of (name, value) tuples.
    for option in select(soup, "select[name=movieId] option"):
        for name, value in option.attrs:
            if str(name) == 'value':
                try:
                    film_list.append(int(value))
                except ValueError:
                    pass  # skip options without a numeric value
    for area in select(soup, "select[name=areaId] option"):
        for name, value in area.attrs:
            if str(name) == 'value':
                try:
                    area_list.append(int(value))
                except ValueError:
                    pass
    # The scraped lists are then discarded in favor of a fixed snapshot.
    film_list = [
        4883, 4954, 4951, 4851, 4948, 4731, 4924, 4913, 4925, 4908, 4886, 4946,
        4912, 4936, 4901, 4903, 4915, 4850, 4940, 4941, 4919, 4904, 4899, 4833,
        4878, 4875, 4911, 4807, 4914, 4768, 4891, 4837, 4887, 4868, 4839, 4848,
        4803, 4828, 4869, 4855, 4844, 4863, 4865, 4861, 4822, 4818, 4808, 4767,
        4753, 4812, 4700, 4778, 4733, 4584, 4687, 4765, 4769, 4789, 4723, 4526,
        4772, 4648, 4587, 4796, 4223, 4694, 4365, 4521, 4470, 3941, 3026, 2924
    ]
    #area_list = [18, 16, 20, 22, 19, 13, 21, 10, 17, 11, 12, 14, 23]
    area_list = [20, 6, 2, 10]
    return film_list, area_list
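
A sketch of how the two functions plausibly fit together; the nesting order and the exact query-string format are assumptions read off the URL above, not confirmed by the snippets:

# Sketch: crawl showtimes for every (movie, area) pair.
film_list, area_list = get_cross_data()
for movie_id in film_list:
    for area_id in area_list:
        url = ("http://tw.m.yahoo.com/w/twmovie/schedulesearch_bp.php"
               "?movieId=%d&areaId=%d" % (movie_id, area_id))
        craw_out_time(movie_id, area_id, url)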
Example #3
    def crawl(self):
        """Collect phrasal verbs from every page in self.sites."""
        # `requests` is assumed imported at module level.
        self.all_phrasals = []
        for site in self.sites:
            html = requests.get(site).text
            soup = Soup(html)
            selected = select(soup, '.divbox a strong')
            phrasals = [s.contents[0] for s in selected]
            self.all_phrasals += phrasals
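
The method clearly lives on a crawler class that provides `self.sites`; a minimal hypothetical host class for context (the class name is made up):

class PhrasalCrawler(object):
    # Hypothetical host class: crawl() above only needs an iterable of
    # site URLs in self.sites.
    def __init__(self, sites):
        self.sites = sites
        self.all_phrasals = []
    # ... crawl() as shown above ...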
Example #4
def craw_a_movie(url):
    """Scrape one movie detail page into a dict of metadata fields."""
    fd = urllib.urlopen(url)
    soup = Soup(fd)
    data = {}
    # The .dta blocks appear in a fixed order on the page.
    raw = select(soup, '.dta')
    data['open'] = raw[0].string
    data['class'] = raw[1].string
    data['long'] = raw[2].string
    data['director'] = raw[3].string
    data['cast'] = raw[4].string
    # attrs is a list of (name, value) tuples in BeautifulSoup 3.
    for name, value in select(soup, '.border img')[0].attrs:
        if name == 'src':
            data['cover'] = str(value)
            break
    raw = select(soup, '#ymvs .bd .full p')
    # Strip HTML tags from the synopsis markup.
    data['description'] = re.sub("<.*?>", "", str(raw))
    return data
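
A brief usage sketch; `movie_url` is a placeholder that would come from a listing page (see `movie_link_parser` further down):

# Sketch: scrape one movie page and dump its fields.
info = craw_a_movie(movie_url)  # movie_url: placeholder detail-page URL
for key, value in info.items():
    print '%s: %s' % (key, value)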
Example #5
    def create(cls, user, **data):
        # Keep only the alphanumeric characters of the course number.
        data['lvano'] = ''.join([x for x in data['lvano'] if x.isalnum()])

        url = 'https://tiss.tuwien.ac.at/course/courseDetails.xhtml?courseNr=' + data['lvano']
        url_de = url + '&locale=de'
        url_en = url + '&locale=en'
        soup_de = Soup(urllib.urlopen(url_de))
        soup_en = Soup(urllib.urlopen(url_en))

        # The first non-empty <h1> holds the course name; remove_spans drops
        # the nested <span> (the course number) from it.
        remove_spans = lambda x: x.text.replace(select(x, 'span')[0].text, '').strip()
        get_name = lambda s: remove_spans([x for x in select(s, 'h1') if x.text][0])
        data['lvaname_de'] = get_name(soup_de)
        data['lvaname_en'] = get_name(soup_en)
        data['lvaname_import'] = data['lvaname'].strip()
        # Refuse to create the certificate if the imported name matches
        # neither the German nor the English name on TISS.
        if data['lvaname'] not in (data['lvaname_de'], data['lvaname_en']):
            print data['lvaname'], 'is not the same as', data['lvaname_de'], data['lvaname_en']
            return
        del data['lvaname']
        try:
            cert = Certificate.objects.get(lvano=data['lvano'], user=user)
            print cert, 'cert already exists'
        except Certificate.DoesNotExist:
            return Certificate.objects.create(user=user, **data)
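
A hedged usage sketch; the user object and field values are hypothetical, and `create` is presumably a classmethod on the Certificate model:

# Hypothetical call: all values are made up. Returns None (and prints a
# notice) when the name does not match the TISS page, or when the
# certificate already exists for this user.
cert = Certificate.create(some_user, lvano='123.456', lvaname='Some Course')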
Example #6
    def parse_from_curriculum_url(cls, url):
        """Build a Curriculum tree from a TISS curriculum page."""
        soup = Soup(urllib.urlopen(url))

        levelindex = 0
        previous_levelindex = -1
        curriculum = None
        stack = []  # open LVATree nodes, innermost last
        obj = None

        # Crude check for the TISS error page.
        if 'errorPage' in soup:
            print 'error page!'
            return

        titles = [x.text for x in select(soup, 'h1') if x.text]

        # The page title looks like "<type> <number> <name>".
        currno = ''.join(titles[0].strip().split(' ', 2)[:2])
        name = ''.join(titles[0].strip().split(' ', 2)[2:])

        # The body's class attribute encodes the locale, e.g. "lehre de".
        lang = dict(select(soup, 'body')[0].attrs)['class'].replace('lehre', '').strip()
        if lang != 'de':
            print 'import not german'
            return

        for x in select(soup, '#nodeTable tbody tr'):
            data = select(x, 'td')
            title = data[0]
            (smst, ects) = data[-2:]  # smst (semester) is unused

            ects = ects.text
            if ects:
                ects = Decimal(ects)

            # The nesting depth is encoded in the title cell's class,
            # e.g. "nodeTable-level-2".
            titlelevel = dict(select(title, 'div')[0].attrs)['class'].replace('nodeTable-level-', '')
            levelindex = int(titlelevel[0], 10)

            # Moving back up the tree: pop the finished subtrees.
            if previous_levelindex > levelindex and stack and levelindex < 3:
                number = previous_levelindex if previous_levelindex < 4 else 3
                for x in range(levelindex, number):
                    stack.pop()

            if levelindex == 0:
                # Root row: the curriculum itself.
                curriculum, created = Curriculum.objects.get_or_create(name=name, number=currno)
                if not created:
                    curriculum.trees.all().delete()
            if levelindex == 1:
                name = select(title, 'span')[0].text
                type_ = title.text[:-len(name)]
                if title.text.startswith('Curriculum Suppleme'):
                    print type_
                    break
                lvatree = LVATree.objects.create(treetype=type_, name=name, min_ects=ects if ects else 0)
                if stack:
                    stack[-1].subtrees.add(lvatree)
                else:
                    curriculum.trees.add(lvatree)
                stack.append(lvatree)
            if levelindex == 2:
                # Same handling as level 1, one level deeper.
                name = select(title, 'span')[0].text
                type_ = title.text[:-len(name)]
                lvatree = LVATree.objects.create(treetype=type_, name=name, min_ects=ects if ects else 0)
                if stack:
                    stack[-1].subtrees.add(lvatree)
                else:
                    curriculum.trees.add(lvatree)
                stack.append(lvatree)
            if levelindex == 3:
                # Leaf rows: the actual courses.
                courseTitle = select(title, 'span')[0].text.strip()
                courseType = title.text.strip()[:2]
                if not ects:
                    print 'ignored: ', courseTitle
                    continue
                lva = LVA.objects.create(lvatype=courseType, ects=ects, name_de=courseTitle)
                stack[-1].lvas.add(lva)
            if levelindex == 4:
                pass  # level-4 rows are not imported

            previous_levelindex = levelindex
        return curriculum
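
A brief hedged usage sketch; `curriculum_url` is a placeholder, and the method is assumed to be a classmethod on the Curriculum model:

# Hypothetical call; the page must be the German locale, otherwise the
# method prints a notice and returns None.
curriculum = Curriculum.parse_from_curriculum_url(curriculum_url)
if curriculum is not None:
    print curriculum.number, curriculum.name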
Example #7
def movie_link_parser(file_name, links):
    """Fill `links` with {title: href} for every movie on a listing page."""
    # The parameter was renamed from `list`, which shadows the builtin.
    file_ob = urllib.urlopen(file_name)
    soup = Soup(file_ob)
    for out in select(soup, '.bd.vlist h4 a'):
        links[out.string.encode("utf-8")] = out['href']
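
A short usage sketch; `listing_url` is a placeholder for a movie-listing page:

# Sketch: gather title -> href links, then feed them to craw_a_movie().
links = {}
movie_link_parser(listing_url, links)  # listing_url: placeholder URL
for title, href in links.items():
    print title, href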
Example #8
def craw_a_movie(url):
    fd = urllib.urlopen(url)
    soup = Soup(fd)
    data = {}
    # Note: select() returns a list of tag objects, not a string.
    data["location"] = select(soup, 'h1')
    return data
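
Since select() hands back tag objects, a caller likely wants the text; a hedged sketch (`page_url` is a placeholder):

# Sketch: extract plain text from the h1 tags returned above.
data = craw_a_movie(page_url)  # page_url: placeholder URL
headings = [h.string for h in data["location"]]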