def extract_raw_strokes(xml_path, removal_threshold=100.0):
    """Read the pen strokes stored under ``StrokeSet`` in an XML file.

    Every ``Stroke`` element is scanned ``Point`` by ``Point``.  A point is
    discarded as probable noise when it lies further than
    *removal_threshold* (Euclidean norm) from the last accepted point, or
    when its timestamp equals the timestamp of the last accepted point.
    Strokes left with fewer than two points are dropped because they cannot
    be interpolated.

    Args:
        xml_path: Source handed to ``parse``.
        removal_threshold: Largest allowed distance between a point and the
            previously accepted one.

    Returns:
        A list with one ``(ts, xys)`` tuple per kept stroke: ``ts`` is a
        ``numpy`` array of the accepted timestamps and ``xys`` is the
        transposed ``numpy`` array of the accepted coordinates (presumably
        one row per axis -- depends on what ``make_tuple`` returns).
    """
    root = parse(xml_path).getroot()
    stroke_set = root.find("StrokeSet")

    result = []
    for stroke in stroke_set.iter("Stroke"):
        first = stroke.find("Point")
        if first is None:
            # Nothing to seed the noise filter with; such a stroke could not
            # be interpolated anyway.
            continue

        # The first point is the initial distance reference.  No timestamp
        # has been accepted yet, so ``prev_t`` starts as None; seeding it
        # with the first point's own timestamp would make the
        # duplicate-timestamp test reject the first point itself.
        prev_xy, _ = make_tuple(first)
        prev_t = None

        xys = []
        ts = []
        for point in stroke.iter("Point"):
            xy, t = make_tuple(point)
            too_far = numpy.linalg.norm(xy - prev_xy) > removal_threshold
            same_time = prev_t is not None and t == prev_t
            if too_far or same_time:
                # maybe noise
                continue
            prev_xy = xy
            prev_t = t
            xys.append(xy)
            ts.append(t)

        if len(ts) < 2:
            # unable to interpolate
            continue
        result.append((numpy.array(ts), numpy.array(xys).transpose()))
    return result
for element in xml: if element.tag == "DOCUMENT": doc = document.Document(sla_parent=self) success = doc.fromxml(element) if success: obj.document = doc return True, obj else: return False, obj success = False if isinstance(xml, lxml.etree._Element): success, self = read_xml(self, xml) else: if os.path.exists(os.path.realpath(xml)): xml = lxml.parse(filepath).getroot() success, self = read_xml(self, xml) else: raise TypeError("fromxml requires lxml.etree._Element.") return success # vim:set shiftwidth=4 softtabstop=4 spl=en:
def parse_options_data(table):
    """Turn an HTML ``table`` element into the chunk produced by TextParser.

    The first ``tr`` found under *table* is unpacked as the header (its
    ``th`` cells); every following ``tr`` is unpacked as a data row.  The
    rows are then fed to ``TextParser`` with the header as column names and
    the result of ``get_chunk()`` is returned.
    """
    all_rows = table.findall('.//tr')
    header_row, body_rows = all_rows[0], all_rows[1:]
    column_names = _unpack(header_row, kind='th')
    records = []
    for row in body_rows:
        records.append(_unpack(row))
    parser = TextParser(records, names=column_names)
    return parser.get_chunk()


if __name__ == '__main__':
    # Other pages tried while experimenting:
    #parsed = parse('http://finance.yahoo.com/q/op?s=AAPL+Options')
    #parsed = parse('http://www-rohan.sdsu.edu/~gawron')
    #parsed = parse('http://www.lajollasurf.org/cgi-bin/plottide.pl')
    url = 'http://www.caferouge.com/menus/mainmenu'
    parsed = parse(url)
    #id="ctl00_ctl00_Content_MCC_RadDatePicker_calendar_Top"
    doc = parsed.getroot()

    # Collect every anchor and inspect a small slice of them.
    links = doc.findall('.//a')
    links_sub_list = links[15:20]
    lnk = links_sub_list[0]
    sample_url = lnk.get('href')
    sample_display_text = lnk.text_content()

    tables = doc.findall('.//table')
    ## Look at tables, find a table of interest
def extract_raw_text(xml_path):
    """Return the ``text`` attribute of each ``TextLine`` in the transcription.

    The file at *xml_path* is parsed, the ``Transcription`` element is
    looked up under the root, and the ``text`` attribute of every
    ``TextLine`` found below it is collected in document order.
    """
    tree = parse(xml_path)
    transcription_node = tree.getroot().find("Transcription")

    lines = []
    for text_line in transcription_node.iter("TextLine"):
        lines.append(text_line.attrib["text"])
    return lines