async def scrape(self):
    """Load the initial sitemap, queue the nested sitemap URLs, run the
    worker tasks, and report how long the whole scrape took.
    """
    start = datetime.datetime.now()
    # load initial xml
    init_xml = await self.load_init_xml('nyTimesSiteMap/sitemap.xml.gz',
                                        self.loop)
    # Collect nested sitemap locations. Use 'end' events so element.text is
    # guaranteed complete, and guard against text-less elements: the original
    # did `'sitemap' in element.text` on 'start' events, which raises
    # TypeError whenever element.text is None.
    parser = XMLPullParser(['end'])
    parser.feed(init_xml)
    for event, element in parser.read_events():
        if element.text and 'sitemap' in element.text:
            self.xml_queue.put_nowait(element.text)
    async with aiohttp.ClientSession() as session:
        tasks = [
            self.handle_task(i, session)
            for i in range(self.num_workers)
        ]
        await asyncio.gather(*tasks)
    stop = datetime.datetime.now()
    time_past = stop - start
    # NOTE(review): .seconds drops whole days and the %3600 drops whole
    # hours — fine for runs under an hour, which appears to be the intent
    minutes = (time_past.seconds % 3600) // 60
    seconds = time_past.seconds % 60
    print(f'visited {self.pages_scraped} websites in {minutes} minutes '
          f'and {seconds} seconds')
def parse(
        xml_text: Union[bytes, Iterator[bytes]],
        path: Tuple = ('feed', 'entry')
) -> Iterator[Union[dict, str]]:
    """
    lazy parser for large xml to extract a list of embedded xml elements
    :param xml_text: xml input, bytes or an iterator of bytes
    :param path: a tuple specifying the path of the xml elements to extract
    :return an iterator of dicts of the embedded xml elements
    """
    # Slice a bytes input into fixed-size chunks so both input forms are
    # consumed identically below. isinstance (not `type(...) is bytes`)
    # also accepts bytes subclasses.
    if isinstance(xml_text, bytes):
        xml_chunks = (xml_text[i:i + _chunk_size]
                      for i in range(0, len(xml_text), _chunk_size))
    else:
        xml_chunks = xml_text
    path = list(path)
    # removed: `tag_stack: [str] = []` — never used, and `[str]` is not a
    # valid type annotation
    xml_parser = XMLPullParser(events=('start', 'end'))
    handle_partial_xml = _get_partial_xml_handler(path)
    for chunk in xml_chunks:
        xml_parser.feed(chunk)
        # the handler decides which completed elements match `path`
        for element in handle_partial_xml(xml_parser):
            yield _elem_to_dict_or_str(element)
def parse(self, xml_bytes):
    """Parse *xml_bytes* and return the pulled events as Python objects.

    Args:
        xml_bytes (bytes): Non-empty byte string of XML data.

    Returns:
        Whatever ``self._parser_read_events`` produces for the events.

    Raises:
        TypeError: if *xml_bytes* is not a bytes object.
        ValueError: if *xml_bytes* is empty.
    """
    # Explicit validation instead of `assert`: asserts are stripped when
    # Python runs with -O, silently disabling the checks.
    if not isinstance(xml_bytes, bytes):
        raise TypeError('xml_bytes must be bytes, got %s'
                        % type(xml_bytes).__name__)
    if not xml_bytes:
        raise ValueError('xml_bytes must not be empty')
    parser = XMLPullParser(['start', 'end', 'start-ns', 'end-ns'])  # ignore 'comment' & 'pi'
    # closing() guarantees parser.close() even if feed/processing raises
    with closing(parser) as parser:
        parser.feed(xml_bytes)
        return self._parser_read_events(parser)
def read_feed(self):
    """Parse the stream returned by ``self.read()`` into a Feed.

    Header fields seen before the first ``<item>`` initialise the Feed;
    every closed ``<item>`` is appended to ``feed.entries`` as a
    FeedMessage.

    Returns:
        The populated Feed, or None when parsing fails before the first
        ``<item>`` is reached.
    """
    feed = None
    is_feed_header = True
    description = ''
    title = ''
    link = ''
    language = ''
    author = ''
    published_on = ''
    guid = ''
    try:
        data = self.read()
        parser = XMLPullParser(['start', 'end'])
        parser.feed(data)
        for event, elem in parser.read_events():
            if event == 'start':
                # the first opening <item> means the header is complete
                if elem.tag == 'item' and is_feed_header:
                    is_feed_header = False
                    feed = Feed(title, link, description, language,
                                published_on)
            else:
                # Read field text on 'end' events, where elem.text is
                # guaranteed complete. (Element also has no get_tag()
                # method — the original `elem.get_tag()` raised
                # AttributeError on the first 'end' event.)
                local_part = elem.tag
                if local_part == 'item':
                    feed_message = FeedMessage(title, link, description,
                                               author, guid)
                    feed.entries.append(feed_message)
                elif local_part == 'title':
                    title = elem.text
                elif local_part == 'description':
                    description = elem.text
                elif local_part == 'link':
                    link = elem.text
                elif local_part == 'guid':
                    guid = elem.text
                elif local_part == 'language':
                    language = elem.text
                elif local_part == 'author':
                    author = elem.text
                elif local_part == 'published_on':
                    published_on = elem.text
    except ParseError as pe:
        # ParseError has no get_reason(); str(pe) carries the message
        print(str(pe.code) + ": " + str(pe))
    return feed
class RequestParser: def __init__(self): self._parser = XMLPullParser(['start', 'end']) self._root_element = None def has_ended(self, data: bytes) -> bool: self._parser.feed(data) for event, element in self._parser.read_events(): if event == 'start' and self._root_element is None: self._root_element = element elif event == 'end' and self._root_element is not None: if element.tag == self._root_element.tag: return True return False
def retrieve_namespaces(self):
    """Collect the XML namespace declarations found in ``self.text``.

    Returns:
        dict: mapping of namespace prefix -> URI (first prefix seen for a
        given URI wins).

    Raises:
        NotImplementedError: on Python < 3.4, where XMLPullParser does
        not exist.
    """
    if version_info < (3, 4):
        raise NotImplementedError('Python 3.4 or higher is required.')
    from xml.etree.ElementTree import XMLPullParser
    ns = {}
    parser = XMLPullParser(['start-ns'])
    parser.feed(self.text)
    # Use the public read_events() API. The original iterated the private
    # parser._events_queue attribute, whose layout is an implementation
    # detail that has changed between CPython versions.
    for event, value in parser.read_events():
        if event != 'start-ns':
            continue
        prefix, uri = value
        if uri not in ns:
            ns[uri] = prefix
    # invert uri->prefix into the prefix->uri mapping callers expect
    return dict((v, k) for k, v in ns.items())
def parse(self, xml_bytes):
    """
    Given *xml_bytes* return the data as a tree of Python objects.

    Args:
        xml_bytes (bytes): Non-empty byte string of XML data.

    Returns:
        results

    Raises:
        TypeError: if *xml_bytes* is not a bytes object.
        ValueError: if *xml_bytes* is empty.
    """
    # Explicit validation instead of `assert`: asserts are stripped when
    # Python runs with -O, silently disabling the checks.
    if not isinstance(xml_bytes, bytes):
        raise TypeError('xml_bytes must be bytes, got %s'
                        % type(xml_bytes).__name__)
    if not xml_bytes:
        raise ValueError('xml_bytes must not be empty')
    parser = XMLPullParser(['start', 'end', 'start-ns', 'end-ns'])  # ignore 'comment' & 'pi'
    # closing() guarantees parser.close() even if feed/processing raises
    with closing(parser) as parser:
        parser.feed(xml_bytes)
        return self._parser_read_events(parser)
async def handle_task(self, task_id, session):
    """Async worker: drain sitemap URLs from the queue, find recipe URLs
    in each sitemap, fetch the recipe HTML and hand it to the parser.
    """
    while not self.xml_queue.empty():
        # get_nowait(): there is no await between the empty() check and
        # the get, so on the single-threaded event loop the item cannot
        # be stolen by another worker in between — unlike `await get()`,
        # which could suspend forever on an empty queue.
        xml_url = self.xml_queue.get_nowait()
        print(f'worker {task_id}: fetching file {xml_url}')
        # get xml with recipes
        recipe_xml = await self.load_xml_gz(xml_url, session)
        recipe_xml_parser = XMLPullParser(['end'])
        recipe_xml_parser.feed(recipe_xml)
        for event, element in recipe_xml_parser.read_events():
            # 'end' events guarantee element.text is fully parsed; the
            # None guard protects against empty <loc/> entries, where
            # `'/recipes/' in None` would raise TypeError
            if 'loc' in element.tag:
                url = element.text
                if url and '/recipes/' in url:
                    html = await self.fetch_url(url, session)
                    await self.parse(html, url)
                    self.pages_scraped += 1
                    if not self.pages_scraped % 50:
                        print(f'{self.pages_scraped} pages scraped',
                              ' so far')
# coding=utf-8
"""Parse books.xml with XMLPullParser and pretty-print its element tree."""
from xml.etree.ElementTree import XMLPullParser

events = ("start", "end", "start-ns", "end-ns")
parser = XMLPullParser(events=events)

# read the whole document; the context manager closes the file handle
# (the original opened it and never closed it)
with open('books.xml', 'r', encoding='utf-8') as fd:
    xml_data = fd.read()
parser.feed(xml_data)

# materialize the events so we can grab the first one
re_events = list(parser.read_events())
# the first event's payload is the document root element
# (assumes the document declares no namespaces before the root —
# a 'start-ns' event would come first; TODO confirm for books.xml)
root_element = re_events[0][1]


def list_tree(element, depth):
    """Recursively print tag and text, indented one tab per nesting level."""
    # guard against element.text being None (e.g. self-closing tags);
    # the original called .strip() on it unconditionally
    text = element.text if element.text and element.text.strip() else ''
    print('\t' * depth, element.tag, ":", text)
    # iterate the element directly: getchildren() was removed in Python 3.9
    for child in element:
        list_tree(child, depth + 1)


list_tree(root_element, 0)
client = MyMongoClient() collection = client.get_collection('UsersLowRep') UsersFilePath = './Data/Users.xml' startId = int(sys.argv[1]) if len(sys.argv) > 1 else 0 dbThreshold = int(sys.argv[2]) if len(sys.argv) > 2 else None nextSwitchId = startId + dbThreshold if dbThreshold is not None else None reputationThreshold = 100 viewThreshold = 100 parser = XMLPullParser(events=['end']) with open(file=UsersFilePath) as f: Id = 0 counter = 0 rep = 0 for line in f: parser.feed(line) for event, elem in parser.read_events(): if elem.tag == 'row': Id = int(elem.get('Id')) if Id < startId: continue # rep += int(elem.get('Reputation')) # counter += 1 reputation = int(elem.get('Reputation')) if elem.get('Views') is not None: viewCount = int(elem.get('Views')) if elem.get('UpVotes') is not None: upCount = int(elem.get('UpVotes')) if elem.get('DownVotes') is not None: downCount = int(elem.get('DownVotes')) if reputation <= reputationThreshold and viewCount <= viewThreshold:
postIdset = set() commentCount = dict() for d in res: postIdset.add(d['Id']) commentCount[d['Id']] = d['CommentCount'] comments = {} parser = XMLPullParser(events=['end']) with open(file=CommentFilePath) as f: counter = 0 #Things to fix, something wrong with the parser, we need to put line contraints on it for line in f: # if counter <= 1: # parser.feed(line) counter += 1 if counter % 1000000 == 0: print('At line %d' % counter) parser.feed('</comments>') parser.close() parser = XMLPullParser(events=['end']) parser.feed('<comments>') # if counter <= 56000000: # continue # if counter > 56200000: # break if entriesNum == 0: break parser.feed(line) for event, elem in parser.read_events(): if (elem.tag == 'row'): postId = int(elem.get('PostId')) if postId in postIdset: if postId in comments.keys():
class Device(object):
    """Handles a Raven or Emu serial device."""

    def __init__(self, device):
        """Open the Raven or Emu and prepare for parsing."""
        from serial import Serial
        self._dev = Serial(device, 115200, timeout=0)
        # strip anything that is not whitespace or a simple XML-ish
        # character before it reaches the parser
        self._sanitizer = re.compile(r'[^\sa-zA-Z0-9<>/_-]')
        self._init_parser()

    def _init_parser(self):
        """Reset the XML parser and prime it with a document tag."""
        self._parser = XMLPullParser(['start', 'end'])
        # Add a junk root tag so we constantly get data
        self._parser.feed("<HomeAssistant>\n")
        # Store the root tag so we can clear it to avoid amassing memory
        for (_, elem) in self._parser.read_events():
            self._root = elem
        # Reset accumulated data; index 0 is the root-level dict
        self._data = [{}]

    def update(self):
        """Pull and parse new data from the serial device."""
        try:
            serial_data = self._dev.read(1024).decode()
            self._parser.feed(self._sanitizer.sub('', serial_data))
            for (event, elem) in self._parser.read_events():
                if event == 'start':
                    # open a fresh dict for this element's children
                    self._data.append({})
                else:
                    data = self._data.pop()
                    data['text'] = elem.text
                    self._data[-1][elem.tag] = data
                    if len(self._data) == 1:
                        # Clear the element from root
                        self._root.remove(elem)
        except ParseError:
            # garbage on the wire: discard state and start a fresh parser
            self._init_parser()

    def get(self, field):
        """Return the data accumulated for a given XML tag."""
        return self._data[0][field]

    def query_instantaneous_demand(self):
        """Request updates on instantaneous demand."""
        # XML is case-sensitive: the closing tag must be </Name> to match
        # <Name> (the original sent a mismatched </name> here)
        self._dev.write(b"<Command>\n" +
                        b" <Name>get_instantaneous_demand</Name>\n" +
                        b" <Refresh>Y</Refresh>\n"
                        b"</Command>\n")
        self._dev.flush()

    def query_summation_delivered(self):
        """Request updates on the various summations."""
        # fixed mismatched </name> closing tag here as well
        self._dev.write(b"<Command>\n" +
                        b" <Name>get_current_summation_delivered</Name>\n" +
                        b" <Refresh>Y</Refresh>\n" +
                        b"</Command>\n")
        self._dev.flush()

    def query_current_price(self):
        """Request updates on pricing."""
        self._dev.write(b"<Command>\n" +
                        b" <Name>get_current_price</Name>\n" +
                        b" <Refresh>Y</Refresh>\n" +
                        b"</Command>\n")
        self._dev.flush()
# NOTE(review): this excerpt appears truncated — the parsed <row> is never
# used after the startId guard; the processing body is presumably cut off.
# Scans the Stack Exchange Posts dump, optionally skipping ahead by Id
# and by line number (both taken from argv).
PostsFilePath = './Data/Posts.xml'
# first Id to process (argv[1], default 0)
startId = int(sys.argv[1]) if len(sys.argv) > 1 else 0
# after this many Ids, presumably switch target DBs — TODO confirm
dbThreshold = int(sys.argv[2]) if len(sys.argv) > 2 else None
nextSwitchId = startId + dbThreshold if dbThreshold is not None else None
# first line to start feeding from (argv[3], optional)
startLine = int(sys.argv[3]) if len(sys.argv) > 3 else None
scoreThreshold = -1
parser = XMLPullParser(events=['end'])
with open(file=PostsFilePath) as f:
    Id = 0
    counter = 0
    for line in f:
        counter += 1
        if counter % 1000000 == 0:
            # every million lines: close the synthetic root, drop the
            # accumulated tree, and restart with a fresh <posts> root so
            # memory stays bounded on the multi-GB dump
            parser.feed('</posts>')
            parser.close()
            parser = XMLPullParser(events=['end'])
            parser.feed('<posts>')
        if startLine is None:
            parser.feed(line)
        else:
            # counter <= 2 keeps the XML declaration and opening root tag;
            # otherwise skip lines before startLine
            if counter <= 2 or counter >= startLine:
                parser.feed(line)
            else:
                continue
        for event, elem in parser.read_events():
            if (elem.tag == 'row'):
                Id = int(elem.get('Id'))
                # skip rows below the requested starting Id
                if Id < startId:
                    continue
"""Scan the Stack Exchange Votes dump and record every post that received
an offensive-type vote (VoteTypeId == 4) in MongoDB."""
from xml.etree.ElementTree import XMLPullParser
from MongodbClient import MyMongoClient
import sys

client = MyMongoClient()
collection = client.get_collection('PostsWithOffensiveVote')
VotesFilePath = './Data/Votes.xml'

parser = XMLPullParser(events=['end'])
with open(file=VotesFilePath) as f:
    counter = 0
    for line in f:
        counter += 1
        if counter % 1000000 == 0:
            # every million lines: close the synthetic root, drop the
            # accumulated tree, and restart with a fresh <votes> root so
            # memory stays bounded on the multi-GB dump
            parser.feed('</votes>')
            parser.close()
            parser = XMLPullParser(events=['end'])
            parser.feed('<votes>')
        parser.feed(line)
        for event, elem in parser.read_events():
            if elem.tag == 'row':
                voteType = int(elem.get('VoteTypeId'))
                # VoteTypeId 4 is treated as the offensive vote —
                # presumably per the Stack Exchange schema; TODO confirm
                if voteType == 4:
                    Id = int(elem.get('PostId'))
                    print('Inserting postid %d' % Id)
                    collection.insert_one({'Id': Id,
                                           'VoteTypeId': voteType})
                # free the finished <row> so memory stays bounded between
                # the parser restarts above
                elem.clear()
# the with-statement closes the file; the original's trailing f.close()
# was redundant and has been removed