def parse(fobj):
    ''' Parse HTML file and extract targets data '''
    root = ES.parse(fobj)
    data = []
    tbllst = root.findall('.//table')
    for elem in tbllst[0][0]:  # iterating over table/tbody/tr elements
        if elem.tag == 'tr':
            counter = 1
            temp = []
            for td in elem:
                if counter == 1:
                    # header row: skip its first cell so the row never collects 3 values
                    if td.text == "Target name":
                        counter += 1
                        continue
                    temp.append(td.text)
                    counter += 1
                elif counter == 3:
                    temp.append(td.text)
                    counter += 1
                elif counter == 10:
                    if td.text is not None:
                        temp.append(td.text)
                    counter += 1
                else:
                    counter += 1
            if len(temp) == 3:
                data.append(temp)
    data.sort(key=lambda item: (item[2], item[0]))
    return data

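# A minimal, hypothetical usage sketch for the parse() function above (not part
# of the original snippet). It assumes the enclosing module already binds ES to
# an ElementTree/ElementSoup-style parser, as parse() expects, and that
# 'targets.html' is a placeholder path to a file containing the expected
# table/tbody/tr markup.
def print_targets(path='targets.html'):
    with open(path) as fobj:
        for row in parse(fobj):  # each row is a 3-item list of cell texts
            print('\t'.join(cell or '' for cell in row))
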
def parse(fobj):
    ''' Parse HTML file and extract targets data '''
    root = ES.parse(fobj)
    tbllst = root.findall('.//table')
    for elem in tbllst[0][0]:  # iterating over table/tbody/tr elements
        if elem.tag == 'tr':
            pass  # per-row processing not included in this snippet

def get_vcard(uri, lat, long):
    # Return a cached vCard for this venue URI, creating it, setting its
    # location and scraping its name on first access.
    try:
        return vcards[uri]
    except KeyError:
        vcard = vcards[uri] = vCard.objects.get_or_create(uri=uri)[0]
        vcard.location = Point(long, lat, srid=4326)
        venue_et = ES.parse(urllib.urlopen(uri))
        vcard.name = [e for e in venue_et.findall('.//div')
                      if e.attrib.get('class') == 'heading'][0].text.strip()
        vcard.save()
        return vcard

def read(cls, tag_str, language):
    tag_str = tag_str.lstrip('*: ')
    t = ElementSoup.parse(StringIO(tag_str), encoding='utf8')
    if t.tag == 'html':
        # Parsing inconsistency with included ampersands
        t = t[0]
    d = dict(t.items())
    d['type'] = t.tag
    if t.text:
        d['description'] = t.text
    d['subtype'] = heuristics.determine_tagtype(
        d['name'] + ' ' + d['description'], language)[1]
    return cls.sanitize(d)

def get_from_link(input_str):
    input_str = input_str.strip()
    if (input_str.count('\n') <= 1
            and input_str.startswith('http://')
            and 'action=edit' in input_str
            and 'wikivoyage' in input_str):
        input_str = fake_agent_readurl(input_str)
        t = ElementSoup.parse(StringIO(input_str))
        if sys.version_info[:2] < (2, 7):
            # XPath support before 2.7 can't handle the bracket syntax;
            # fortunately there seems to be only one textarea
            input_str = t.find(".//textarea").text
        else:
            input_str = t.find(".//textarea[@id='wpTextbox1']").text
        return html_decode(input_str)
    return input_str

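# Hypothetical usage sketch for get_from_link() above (not from the original
# code). The URL is a placeholder wikivoyage edit link; any input that does not
# look like such a link is returned unchanged. fake_agent_readurl() and
# html_decode() are assumed to be defined elsewhere in the original module.
wikitext = get_from_link(
    'http://en.wikivoyage.org/w/index.php?title=Oslo&action=edit')
print(wikitext[:200])
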
def parse(fobj):
    ''' Parse HTML file and extract targets data '''
    d = list()
    root = ES.parse(fobj)
    tbllst = root.findall('.//table')
    for elem in tbllst[0][0]:  # iterating over table/tbody/tr elements
        if elem.tag == 'tr':
            lst = list()
            # elem is 'tr' tag with a few children 'td'
            # iterating over children might look as follows:
            for td in elem:
                lst.append(td.text)
            #print lst
            d.append(lst)
    return d

def parse(fobj):
    ''' Parse HTML file and extract targets data '''
    root = ES.parse(fobj)
    tbllst = root.findall('.//table')
    dictionary = {}
    lister = [dictionary]
    for elem in tbllst[0][0]:  # iterating over table/tbody/tr elements
        if elem.tag == 'tr':
            # elem is 'tr' tag with a few children 'td'
            # iterating over children might look as follows:
            # for td in elem:
            #     print td.text
            # !!!Your code here!!!
            if elem.tag == '/tr':
                break
    return lister

def handle_noargs(self, **options):
    location_data = {}
    for feed in RSSFeed.events.all():
        if not feed.rss_url.startswith('http://www.dailyinfo.co.uk/'):
            continue
        feed_data = feedparser.parse(feed.rss_url)
        items = list(feed.rssitem_set.all())
        guids = set()
        for x_item in feed_data.entries:
            guid, last_modified = x_item.id, datetime(*x_item.date_parsed[:7])
            #print x_item.link
            #if x_item.link != 'http://www.dailyinfo.co.uk/events.php?colname=Lectures%2C+Seminars+and+Conferences&period=7&eventday=10&eventmonth=12&eventyear=2009#70276':
            #    continue
            print x_item.items()
            # Reuse the existing item for this guid if there is one.
            for i in items:
                if i.guid == guid:
                    item = i
                    break
            else:
                item = RSSItem(guid=guid, last_modified=datetime(1900, 1, 1), feed=feed)
            if True or item.last_modified < last_modified:
                item.title = x_item.title.split(': ', 1)[1]
                try:
                    item.description = sanitise_html(Command.SUMMARY_RE.match(x_item.summary).groups(0)[0])
                except:
                    item.description = sanitise_html(x_item.summary)
                item.link = x_item.link
                item.last_modified = last_modified
                item.dt_start = dateutil.parser.parse(x_item.xcal_dtstart)
                item.dt_end = dateutil.parser.parse(x_item.xcal_dtend)
                item.location_url = x_item.xcal_url
                venue_id = int(Command.DAILY_INFO_VENUE_ID_RE.match(x_item.xcal_url).groups(0)[0])
                try:
                    # Venue already seen during this run.
                    item.location_name, item.location_address, item.location_point = location_data[venue_id]
                except KeyError:
                    try:
                        # Try to match the venue to an existing Entity.
                        source, id = daily_info_ids[venue_id]
                        entity_type = iter(EntityType.objects.filter(source=source)).next()
                        entity = Entity.objects.get(**{str(entity_type.id_field): id})
                        item.location_entity = entity
                        item.location_point = entity.location
                        item.location_name = entity.title
                    except (KeyError, Entity.DoesNotExist):
                        # Fall back to scraping the venue page.
                        venue_et = ES.parse(urllib.urlopen(item.location_url))
                        item.location_name = [e for e in venue_et.findall('.//div')
                                              if e.attrib.get('class') == 'heading'][0].text.strip()
                        try:
                            item.location_point = Point(float(x_item.geo_long), float(x_item.geo_lat))
                            print x_item.geo_lat, x_item.geo_long
                        except (AttributeError, ValueError):
                            for link in venue_et.findall('.//a'):
                                match = Command.GOOGLE_MAPS_LINK_RE.match(link.attrib.get('href', ''))
                                if match:
                                    item.location_point = self.postcode_to_point(match.groups(0)[0])
                                    break
                            else:
                                item.location_point = None
                        for para in venue_et.findall('.//p')[1:]:
                            item.location_address = (para.text or '').strip()
                            item.location_address = Command.WHITESPACE_RE.sub(' ', item.location_address)
                            if item.location_point:
                                break
                            match = Command.POSTCODE_RE.search(item.location_address)
                            if not match:
                                break
                            item.location_point = self.postcode_to_point(match.groups(0)[0])
                            print item.location_point
                            break
                    location_data[venue_id] = item.location_name, item.location_address, item.location_point
                item.save()
            guids.add(guid)
        # Remove items that no longer appear in the feed.
        for item in items:
            if item.guid not in guids:
                item.delete()

def parse(self):
    print "scraper parses"
    # Example of a <tr> row from the departure-board HTML being scraped:
    #<tr class="depboard-dark">
    #<td class="time">16:51</td>
    #<td class="prognosis"><span class="rtLimit3">16:51</span></td></td>
    #<td class="product centeredText">
    # <a href="hta=sq&"><img class="product" src="/f" alt="U 6" /><br /><span class="nowrap">U 6</span></a></td>
    #<td class="timetable">
    # <strong><a href="http:tart=yes">Siebenhirten</a></strong><br />
    # <a href="http:tart=yes">Wien</a>16:51 - <a href="httyes">W..</td>
    #<td class="platform">2</td>
    html = ElementSoup.parse(self.br.response().read(), argIsHtml=True)
    #html = ElementSoup.parse("/home/jo/.xbmc/addons/x2/src/ds.html")
    #print self.br.response().read()
    #for t in html.findall(".//*"): print t.text
    # html is an Element instance
    stuff = {}

    def findxpath(xp):
        # accesses the current row object; concatenates and normalises the
        # text of all matching descendants
        cc = ''
        for f in row.findall(xp) or range(-1):
            cc += f.text or ''
        cc = re.sub("(?m)\\s+", " ", cc)
        cc = re.sub("^ ", "", cc)
        cc = re.sub(" $", "", cc)
        return cc

    for row, r in zip(html.findall(".//td[@class='time']/.."), range(3)):
        stuff[r] = {}
        ti = findxpath("./td[@class='time']")
        pr = findxpath("./td[@class='prognosis']//*")
        if pr == "" or re.search("p.nktlich", pr) is not None:
            stuff[r][0] = ti
        else:
            stuff[r][0] = pr + "!"
        stuff[r][1] = findxpath("./td[@class='product centeredText']//*")
        stuff[r][2] = findxpath("./td[@class='timetable']//strong//*")
        stuff[r][3] = findxpath("./td[@class='platform']")

    # Truncate long cells and strip the Bus/Tram prefix from the product column.
    for row in stuff:
        for col in stuff[row]:
            cc = stuff[row][col]
            cc = re.sub('(.{50}).*', '\\1', cc)
            if col == 1:
                cc = re.sub(r"Bus\s*", "", cc)
                cc = re.sub(r"Tram\s*", "", cc)
            stuff[row][col] = cc

    # Pad missing rows/columns with empty strings.
    for r in range(3):
        if r not in stuff:
            stuff[r] = {}
        for c in range(4):
            if c not in stuff[r]:
                stuff[r][c] = ""
    #print stuff
    print 'scraper parsing done'  #, found %s..'%stuff[0][0]
    return stuff

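# Illustrative note, not in the original code: parse() above always returns a
# 3x4 grid of strings keyed by integers 0-2 (rows) and 0-3 (columns), holding
# departure time (the real-time prognosis suffixed with '!' is used instead
# when one is present and it is not 'puenktlich'), line/product, destination,
# and platform, with missing rows and columns padded with ''.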