コード例 #1
0
ファイル: currentset.py プロジェクト: Dudi93/PythonTasks
def parse(fobj):
    '''
    Parse HTML file and extract targets data.

    Reads the first table in the document (table/tbody/tr structure) and,
    for each data row, collects the text of the 1st, 3rd and 10th cells.
    The header row (first cell text "Target name") and any row that does
    not yield exactly three values are discarded.

    :param fobj: file-like object with the HTML document
    :return: list of [name, col3, col10] triples, sorted ascending by
             (col10, name)
    '''

    root = ES.parse(fobj)
    data = []
    tbllst = root.findall('.//table')
    for elem in tbllst[0][0]:  # iterating over table/tbody/tr elements
        if elem.tag != 'tr':
            continue
        cells = list(elem)
        # Skip the header row entirely; it can never produce 3 values anyway.
        if cells and cells[0].text == "Target name":
            continue
        temp = []
        for position, td in enumerate(cells, start=1):
            if position in (1, 3):
                # Columns 1 and 3 are taken as-is (may be None).
                temp.append(td.text)
            elif position == 10 and td.text is not None:
                # Column 10 only counts when it actually has text.
                temp.append(td.text)
        # Only complete records survive.
        if len(temp) == 3:
            data.append(temp)

    # Sort by column-10 value first, then by target name.
    data.sort(key=lambda item: (item[2], item[0]))

    return data
コード例 #2
0
def parse(fobj):
    '''
    Parse HTML file and extract targets data.

    Walks the rows of the first table (table/tbody/tr) and keeps the text
    of cells 1, 3 and 10 of each row, skipping the "Target name" header
    cell and any row that does not produce exactly three values.

    :param fobj: file-like object with the HTML document
    :return: list of [name, col3, col10] triples sorted by (col10, name)
    '''

    root = ES.parse(fobj)
    data = []
    tbllst = root.findall('.//table')
    for elem in tbllst[0][0]:  # iterating over table/tbody/tr elements
        if elem.tag != 'tr':
            continue
        temp = []
        # enumerate replaces the hand-maintained counter of the original.
        for counter, td in enumerate(elem, start=1):
            if counter == 1:
                if td.text == "Target name":
                    continue  # header row marker; contributes nothing
                temp.append(td.text)
            elif counter == 3:
                temp.append(td.text)
            elif counter == 10 and td.text is not None:
                # `is not None` instead of `!= None` (identity, not equality).
                temp.append(td.text)
        if len(temp) == 3:
            data.append(temp)

    data.sort(key=lambda item: (item[2], item[0]))

    return data
コード例 #3
0
def parse(fobj):
    '''
    Parse HTML file and extract targets data
    '''
    # NOTE(review): this snippet is truncated -- the body of the `if`
    # below is missing, so the function is not valid Python as shown.

    root = ES.parse(fobj)
    tbllst = root.findall('.//table')
    for elem in tbllst[0][0]:  # iterating over table/tbody/tr elements
        if elem.tag == 'tr':
コード例 #4
0
def get_vcard(uri, lat, long):
    """Return the vCard cached for *uri*, creating and populating it on a miss.

    On a cache hit the stored vCard is returned unchanged (location/name
    are NOT refreshed).  On a miss the record is fetched-or-created,
    geolocated, named from the venue page, saved and returned.

    NOTE(review): the parameter ``long`` shadows the Python 2 builtin.
    """
    try:
        # Cache hit: return as-is.
        return vcards[uri]
    except KeyError:
        # Cache miss: get or create the DB row and remember it.
        vcard = vcards[uri] = vCard.objects.get_or_create(uri=uri)[0]
        
    # Point takes (x, y) = (longitude, latitude); srid=4326 is WGS84.
    vcard.location = Point(long, lat, srid=4326)
    venue_et = ES.parse(urllib.urlopen(uri))
    # The first <div class="heading"> on the venue page holds its name.
    vcard.name = [e for e in venue_et.findall('.//div') if e.attrib.get('class')=='heading'][0].text.strip()
    vcard.save()
    return vcard
コード例 #5
0
 def read(cls, tag_str, language):
     """Parse a single listing-tag string into a sanitized attribute dict.

     NOTE(review): assumes the parsed element carries a 'name' attribute
     whenever it has text -- d['name'] below raises KeyError otherwise;
     confirm against callers.
     """
     # Strip leading wiki markup characters ('*', ':', spaces).
     tag_str = tag_str.lstrip('*: ')
     t = ElementSoup.parse(StringIO(tag_str), encoding='utf8')
     if t.tag == 'html':
         # Parsing inconsistency with included ampersands
         t = t[0]
     # Element attributes become the dict entries; tag name becomes 'type'.
     d = dict(t.items())
     d['type'] = t.tag
     if t.text:
         d['description'] = t.text
         # Heuristic subtype classification from name + description text.
         d['subtype'] = heuristics.determine_tagtype(
             d['name'] + ' ' + d['description'], language)[1]
     return cls.sanitize(d)
コード例 #6
0
def get_from_link(input_str):
    """Resolve a wikivoyage edit-page URL to its article wikitext.

    If *input_str* is a single-line URL pointing at a wikivoyage
    'action=edit' page, fetch it and return the decoded contents of the
    edit textarea; any other input is returned stripped of surrounding
    whitespace.
    """
    text = input_str.strip()
    looks_like_edit_url = (text.count('\n') <= 1
                           and text.startswith('http://')
                           and 'action=edit' in text
                           and 'wikivoyage' in text)
    if not looks_like_edit_url:
        return text
    page = fake_agent_readurl(text)
    tree = ElementSoup.parse(StringIO(page))
    if sys.version_info[:2] < (2, 7):
        # Xpath too stupid for bracket syntax,
        # fortunately there seems to be only one
        node = tree.find(".//textarea")
    else:
        node = tree.find(".//textarea[@id='wpTextbox1']")
    return html_decode(node.text)
コード例 #7
0
ファイル: currentset.py プロジェクト: VolBon/ubuntu
def parse(fobj):
    '''
    Parse HTML file and extract targets data.

    Returns one list per <tr> of the first table's tbody, each holding
    the .text values of that row's child cells.
    '''
    rows = list()

    root = ES.parse(fobj)
    first_table = root.findall('.//table')[0]
    # first_table[0] is the tbody; its children are the <tr> elements.
    for row in first_table[0]:
        if row.tag == 'tr':
            # Collect the text of every child cell of this row.
            rows.append([cell.text for cell in row])
    return rows
コード例 #8
0
def parse(fobj):
    '''
    Parse HTML file and extract targets data
    '''
    # NOTE(review): this looks like an unfinished exercise template --
    # the loop below never fills `dictionary`, so the function always
    # returns a list containing a single empty dict.

    root = ES.parse(fobj)
    tbllst = root.findall('.//table')
    dictionary = {}
    lister = [dictionary]
    for elem in tbllst[0][0]:  # iterating over table/tbody/tr elements
        if elem.tag == 'tr':
            # elem is 'tr' tag with a few children 'td'
            # iterating over children might look as follows:
            # for td in elem:
            #    print td.text
            # !!!Your code here!!!

            # NOTE(review): dead code -- elem.tag was just tested to be
            # 'tr', so it can never equal '/tr' here.
            if elem.tag == '/tr':
                break

    return lister
コード例 #9
0
    def handle_noargs(self, **options):
        location_data = {}

        for feed in RSSFeed.events.all():
            if not feed.rss_url.startswith('http://www.dailyinfo.co.uk/'):
                continue

            
            feed_data = feedparser.parse(feed.rss_url)
            items = list(feed.rssitem_set.all())
            guids = set()
            
            for x_item in feed_data.entries:
                guid, last_modified = x_item.id, datetime(*x_item.date_parsed[:7])
                
                #print x_item.link
                #if x_item.link != 'http://www.dailyinfo.co.uk/events.php?colname=Lectures%2C+Seminars+and+Conferences&period=7&eventday=10&eventmonth=12&eventyear=2009#70276':
                #    continue
                
                print x_item.items()
                            
                for i in items:
                    if i.guid == guid:
                        item = i
                        break
                else:
                    item = RSSItem(guid=guid, last_modified=datetime(1900,1,1), feed=feed)
                    
                if True or item.last_modified < last_modified:
                    item.title = x_item.title.split(': ', 1)[1]
                    
                    try:
                        item.description = sanitise_html(Command.SUMMARY_RE.match(x_item.summary).groups(0)[0])
                    except:
                        item.description = sanitise_html(x_item.summary)
                        
                    item.link = x_item.link
                    item.last_modified = last_modified
                    item.dt_start = dateutil.parser.parse(x_item.xcal_dtstart)
                    item.dt_end = dateutil.parser.parse(x_item.xcal_dtend)
                    
                    item.location_url = x_item.xcal_url
                    
                    venue_id = int(Command.DAILY_INFO_VENUE_ID_RE.match(x_item.xcal_url).groups(0)[0])

                    try:
                        item.location_name, item.location_address, item.location_point = location_data[venue_id]
                    except KeyError:
                        try:
                            source, id = daily_info_ids[venue_id]
                            entity_type = iter(EntityType.objects.filter(source=source)).next()
                            entity = Entity.objects.get(**{str(entity_type.id_field): id})
                            item.location_entity = entity
                            item.location_point = entity.location
                            item.location_name = entity.title
                        except (KeyError, Entity.DoesNotExist):
                            venue_et = ES.parse(urllib.urlopen(item.location_url))
                            item.location_name = [e for e in venue_et.findall('.//div') if e.attrib.get('class')=='heading'][0].text.strip()
                            
                            try:
                                item.location_point = Point(float(x_item.geo_long),
                                                            float(x_item.geo_lat))
                                print x_item.geo_lat, x_item.geo_long
                            except AttributeError, ValueError:
                                for link in venue_et.findall('.//a'):
                                    match = Command.GOOGLE_MAPS_LINK_RE.match(link.attrib.get('href', ''))
                                    if match:
                                        item.location_point = self.postcode_to_point(match.groups(0)[0])
                                        break
                                else:
                                    item.location_point = None
                            
                            for para in venue_et.findall('.//p')[1:]:
                                item.location_address = (para.text or '').strip()
                                item.location_address = Command.WHITESPACE_RE.sub(' ', item.location_address)
                                if item.location_point:
                                    break
                                    
                                match = Command.POSTCODE_RE.search(item.location_address)
                                if not match:
                                    break
                                    
                                item.location_point = self.postcode_to_point(match.groups(0)[0])
                                print item.location_point
                                break
                            
                            location_data[venue_id] = item.location_name, item.location_address, item.location_point
                    
                    
                    item.save()

                
                guids.add(guid)
                
            for item in items:
            
                if not item.guid in guids:
                    item.delete()
コード例 #10
0
    def parse(self):
        """Scrape the departure-board HTML into a 3x4 dict of dicts.

        Returns stuff[row][col] for the first three departures:
        col 0 = departure time ('!' appended when a differing prognosis
        time is shown), 1 = line/product, 2 = destination, 3 = platform.
        Missing rows/cells are padded with ''.
        """
        print "scraper parses"
        #<tr class="depboard-dark">
        #<td class="time">16:51</td>
        #<td class="prognosis"><span class="rtLimit3">16:51</span></td></td>
        #<td class="product centeredText">
        # <a href="hta=sq&"><img class="product" src="/f" alt="U 6" /><br /><span class="nowrap">U 6</span></a></td>
        #<td class="timetable">
        # <strong><a href="http:tart=yes">Siebenhirten</a></strong><br />
        # <a href="http:tart=yes">Wien</a>16:51 - <a href="httyes">W..</td>
        #<td class="platform">2</td>

        
        html = ElementSoup.parse(self.br.response().read(), argIsHtml=True)
        #html = ElementSoup.parse("/home/jo/.xbmc/addons/x2/src/ds.html")

        #print self.br.response().read()
        #for t in html.findall(".//*"): print t.text
        # html is an Element instance
        stuff = {}
        # Concatenate and whitespace-normalise the text of all nodes
        # matching xp under `row`.  NOTE: `row` is read from the enclosing
        # scope at call time (late binding) -- it is the loop variable of
        # the for-loop below, so findxpath must only be called inside it.
        # NOTE(review): range(-1) is an empty sequence, so the `or`
        # fallback iterates nothing; it only avoids iterating a falsy
        # findall() result.
        def findxpath(xp):      #accesses current row object
            cc=''
            for f in row.findall(xp) or range(-1):
                cc += f.text or ''
                cc = re.sub("(?m)\\s+", " ", cc)
                cc = re.sub("^ ", "", cc)
                cc = re.sub(" $", "", cc)
            return cc
        
        # Only the first three departures (zip with range(3) truncates).
        for row,r in zip(html.findall(".//td[@class='time']/.."), range(3)):
            stuff[r] = {}
                      
            ti=findxpath("./td[@class='time']")
            pr=findxpath("./td[@class='prognosis']//*")
            # Use the scheduled time unless the prognosis differs from
            # "puenktlich" (on time); then show the prognosis with '!'.
            if pr=="" or re.search("p.nktlich", pr) is not None:
                stuff[r][0]=ti
            else:
                stuff[r][0]=pr+"!"

            stuff[r][1]=findxpath("./td[@class='product centeredText']//*")
            stuff[r][2]=findxpath("./td[@class='timetable']//strong//*")
            stuff[r][3]=findxpath("./td[@class='platform']")
            

        # Post-process: cap each cell at 50 chars; strip the vehicle-type
        # prefix ("Bus"/"Tram") from the product column.
        for row in stuff:
            for col in stuff[row]:
                cc = stuff[row][col]
                cc = re.sub('(.{50}).*', '\\1', cc)
                if col==1:
                    cc=re.sub("Bus\s*", "", cc)
                    cc=re.sub("Tram\s*", "", cc)
                stuff[row][col]=cc

                    
        # Pad to a fixed 3x4 shape with empty strings.
        for r in range(3):
            if not r in stuff:
                stuff[r]= {}
            for c in range(4):
                if not c in stuff[r]:
                    stuff[r][c]= ""
        
        #print stuff
        print 'scraper parsing done'#, found %s..'%stuff[0][0]
        return stuff