def load_trec(path, dialect = 0): # use default ``xml.sax.expatreader``
    """Parse a TREC file and return a mapping of DOCNO -> TEXT.

    Args:
        path: Path to the TREC file; read as UTF-8 with decode errors ignored.
        dialect: Selects the parser class: 0 -> TrecParser, 1 -> TrecParser2,
            2 -> TrecParser3.

    Returns:
        dict mapping each parsed entry's "DOCNO" field to its "TEXT" field.

    Raises:
        ValueError: If ``dialect`` is not 0, 1 or 2. (Previously an unknown
            dialect left ``parser`` unbound and crashed with NameError.)
    """
    all_entry = []

    def callback(entry):
        all_entry.append(entry)

    if dialect == 0:
        parser = TrecParser(callback)
    elif dialect == 1:
        parser = TrecParser2(callback)
    elif dialect == 2:
        parser = TrecParser3(callback)
    else:
        raise ValueError("unknown TREC dialect: %r" % (dialect,))

    with open(path, encoding='utf-8', errors='ignore') as f:
        for buffer in f:
            try:
                parser.feed(buffer)
            except StopIteration:
                # The parser signals "document complete / stop reading" by
                # raising StopIteration.
                break

    index_corpus = {}
    for entry in all_entry:
        index_corpus[entry["DOCNO"]] = entry["TEXT"]
    return index_corpus
def text(self):
    """Return this entry's text as a list of strings.

    Prefers the HTML variant ('html_text') when present, rendering it
    through HTMLTextParser; otherwise wraps the plain 'text' value.
    """
    if 'html_text' not in self.object_dict:
        return [self.object_dict['text']]
    html_parser = HTMLTextParser()
    html_parser.feed(self.object_dict['html_text'])
    html_parser.close()
    return html_parser.get_formatted_text()
def dehtml(text):
    """Strip HTML markup from *text*, returning plain text.

    Best-effort: on any failure the traceback is printed to stderr and the
    original input is returned unchanged.
    """
    try:
        parser = _DeHTMLParser()
        parser.feed(str(text))
        parser.close()
        return parser.text()
    except Exception:
        # Was a bare ``except:`` — keep the best-effort fallback, but no
        # longer swallow SystemExit / KeyboardInterrupt.
        from traceback import print_exc
        print_exc(file=sys.stderr)
        return text
def _parse_xml_string_to_dom(self, xml_string: str) -> ETree.Element: try: parser = ETree.XMLParser(target=ETree.TreeBuilder(), encoding=self.DEFAULT_ENCODING) parser.feed(xml_string) root = parser.close() except ETree.ParseError as e: raise RequestParserError( "Unable to parse request (%s), invalid XML received:\n%s" % (e, xml_string) ) return root
def _parse_xml_string_to_dom(xml_string: str) -> ETree.Element: try: parser = ETree.XMLParser(target=ETree.TreeBuilder()) parser.feed(xml_string) root = parser.close() except ETree.ParseError as e: raise ProtocolParserError( "Unable to parse request (%s), invalid XML received:\n%s" % (e, xml_string) ) from e return root
def find_rss_link(site, parser):
    """Fetch *site*, run it through *parser*, and return the feed URL.

    Returns None when the parser found no feed location. A relative feed
    path is resolved against the site's host over plain http.
    """
    page = requests.get(site).text
    parser.feed(page)
    location = parser.feedLocation
    if not location:
        return None
    if location[:4] != "http":
        # Relative path: prepend the scheme and host taken from *site*.
        host = site.split("/")[2]
        location = "http://" + host + location
    return location
def html_to_text(self, html):
    """Convert an HTML fragment to plain text; None input yields None."""
    if html is None:
        return None
    # Normalise first: drop carriage returns / newlines, then replace
    # non-breaking spaces with ordinary spaces.
    cleaned = re.sub(r'\r|\n', '', html).replace(u'\xa0', ' ')
    chunker = ChunkingHtmlParser()
    chunker.feed(cleaned)
    return chunker.get_text()
def parseDirHtml(dirHtml, fileType):
    """Extract directory-listing entries of a given file type.

    Thin wrapper around HpwrenHTMLParser.

    Args:
        dirHtml (str): HTML page for directory listing
        fileType (str): File extension (e.g.: '.jpg')

    Returns:
        List of file names matching extension
    """
    table_parser = HpwrenHTMLParser(fileType)
    table_parser.feed(dirHtml)
    return table_parser.getTable()
def description(self):
    """Return this entry's description.

    Prefers rendered 'html_notes'; falls back to plain 'notes', then to
    an empty string.
    """
    obj = self.object_dict
    if 'html_notes' in obj:
        html_parser = HTMLTextParser()
        html_parser.feed(obj['html_notes'])
        html_parser.close()
        formatted = html_parser.get_formatted_text()
        return formatted if len(formatted) > 0 else ""
    return obj.get('notes', "")
def parse_revisions(filename):
    """Stream parsed revision records out of a gzipped XML dump.

    Feeds the file to a SAX parser line by line and yields each record as
    soon as the content handler delivers it, so the whole dump is never
    held in memory at once.
    """
    # NOTE(review): make_parser() expects a list of candidate parser *module*
    # names; "xml.sax.IncrementalParser" is not a module name, so this most
    # likely falls through to the default expat reader — confirm intent.
    parser = xml.sax.make_parser(["xml.sax.IncrementalParser"])
    ready = deque()

    def deliver(x):
        # Callback handed to the content handler: queue a finished record.
        ready.append(x)

    handler = DocumentHandler()
    handler.callback = deliver
    parser.setContentHandler(handler)
    with gzip.GzipFile(filename, "r") as raw_f:
        # Re-wrap the gzip stream so the SAX parser sees UTF-8 bytes.
        f = codecs.EncodedFile(raw_f, "utf8")
        for line in f:
            parser.feed(line)
            # Yield whatever records this chunk completed before reading more.
            while ready:
                yield ready.popleft()
    parser.close()
    # Drain any records flushed by the final close().
    while ready:
        yield ready.popleft()
def parse_description(description):
    """
    Some cinergi descriptions contain html. Parse out the data of the first
    element so we always return a regular string.

    Returns None for empty/missing input, or when the HTML contains no text
    at all (e.g. "<br/>"). The previous version raised IndexError in that
    case, and kept ``contents`` as a class attribute shared between
    instances instead of per-instance state.
    """
    if not description:
        return None

    class DescriptionHTMLParser(HTMLParser):
        def __init__(self):
            HTMLParser.__init__(self)
            # Per-instance accumulator (a class-level list would be shared).
            self.contents = []

        def handle_data(self, data):
            self.contents.append(data)

    parser = DescriptionHTMLParser()
    parser.feed(description)
    return parser.contents[0] if parser.contents else None
def load_trec_date_fbis(path):
    """Parse an FBIS TREC file into {DOCNO: (date, headline)}.

    Missing DATE1/HEADLINE fields default to empty strings.
    """
    entries = []
    parser = TrecParser5(entries.append)
    with open(path, encoding='utf-8', errors='ignore') as fh:
        for chunk in fh:
            try:
                parser.feed(chunk)
            except StopIteration:
                # The parser raises StopIteration when it is done.
                break
    return {
        entry["DOCNO"]: (entry.get("DATE1", ""), entry.get("HEADLINE", ""))
        for entry in entries
    }
def find_link(site, parser):
    """Fetch *site* and return the feed location discovered by *parser*."""
    parser.feed(requests.get(site).text)
    return parser.feedLocation
    # Tail of the model-alias mapping (opening of the dict literal is above
    # this chunk): canonical device name -> aliases as they appear on
    # Google's auto-update-expiration page.
    'lenovo ideapad s330 chromebook': ['lenovo chromebook s330'],
    'lenovo n21 chromebook': [
        'asi chromebook', 'crambo chromebook', 'jp sa couto chromebook',
        'rgs education chromebook', 'true idc chromebook',
        'videonet chromebook', 'consumer chromebook'
    ],
    'lenovo thinkpad 11e 3rd gen chromebook': ['thinkpad 11e chromebook 3rd gen (yoga/clamshell)'],
    'lenovo thinkpad 11e 4th gen chromebook': [
        'lenovo thinkpad 11e chromebook (4th gen)/lenovo thinkpad yoga 11e chromebook (4th gen)'
    ],
    'lenovo thinkpad 13': ['thinkpad 13 chromebook'],
    'poin2 chromebook 14': ['poin2 chromebook 11c'],
    'prowise chromebook eduline': ['viglen chromebook 11c'],
    'prowise chromebook entryline': ['prowise 11.6\" entry line chromebook'],
    'prowise chromebook proline': ['prowise proline chromebook'],
    'samsung chromebook - xe303': ['samsung chromebook'],
}

# Parser state flags — presumably consumed by MyHTMLParser while walking the
# support page's table; confirm against the class definition.
next_data_is_oem = False
next_data_is_td = False
data_is_date = False

# Scrape Google's Chrome OS auto-update-expiration support page and emit a
# Python dict literal on stdout.
auepage = requests.get(
    'https://support.google.com/chrome/a/answer/6220366?hl=en')
print('CROS_AUE_DATES = {')
parser = MyHTMLParser()
parser.feed(auepage.content.decode('utf-8'))
# NOTE(review): output_rows is populated elsewhere (not in this chunk) —
# presumably filled by MyHTMLParser during feed(); verify.
output_rows.sort(key=str.lower)
for row in output_rows:
    print(row)
print('}')
from HTMLParser import HTMLParser
import dateutil.parser


class MyHTMLParser(HTMLParser):
    """Print a normalised epoch timestamp for each published-time tag.

    For every <abbr> whose first attribute value is "time published", the
    second attribute's value is parsed as a date, snapped to the 15th of the
    month at midnight, and printed as Unix epoch seconds ('%s').
    """

    def handle_starttag(self, tag, attrs):
        if tag == "abbr":
            if attrs[0][1] == "time published":
                # Snap to mid-month so every message in the same month
                # collapses to one timestamp.
                print(dateutil.parser.parse(attrs[1][1]).replace(day=15, hour=0, minute=0, second=0, microsecond=0).strftime('%s'))


parser = MyHTMLParser()
# Fix: the original leaked the file handle (open() with no close());
# a context manager guarantees it is closed.
with open('stefan2904/html/messages.html') as f:
    parser.feed(f.read())
def validateHTML(body):
    """Feed *body* through the StackExchangeSite parser and return its flag.

    Not the most efficient approach: HTMLParser offers no way to abort the
    scan once an invalid tag is found, so the whole body is always parsed.
    It is, however, safe.
    """
    checker = StackExchangeSite()
    checker.feed(body)
    return checker.flag
# Scrape the AWS Service Health Dashboard and either print a Zabbix LLD
# service list or push status values to a Zabbix server (Python 2 script).
parser = argparse.ArgumentParser(description='Get RSS list or Zabbix LLD format output from AWS Service Health Dashboard page.')
parser.add_argument('-b', '--block', default="AP", help='set AWS region block(e.g.:NA or SA or EU or AP)')
parser.add_argument('-i', '--interval', type=int, help='set interval time (seconds)')
parser.add_argument('-m', '--send-mode', default='False', help='set True if you send AWS Service Health Dashboard status information. set False if you want to get lld format service list. (e.g.: True or False)')
parser.add_argument('-p', '--zabbix-port', type=int, default=10051, help='set listening port number for Zabbix server')
parser.add_argument('-z', '--zabbix-host', default='localhost', help='set listening IP address for Zabbix server')

block_list = ["NA", "SA", "EU", "AP"]
args = parser.parse_args()
if args.block not in block_list:
    # NOTE(review): this only warns — execution continues with the invalid
    # block name; confirm that is intended rather than sys.exit().
    print "please set block name. :" + " or ".join(map(str, block_list))

base_url = "http://status.aws.amazon.com/"
socket.setdefaulttimeout(30)  # don't hang forever on the dashboard fetch
htmldata = urllib2.urlopen(base_url)
# Rebinds `parser` from the argparse object to the HTML parser.
parser = AWSSHDParser(base_url, args.block, args.zabbix_host, args.zabbix_port)
parser.feed(htmldata.read())

if args.send_mode.upper() == "TRUE":
    # Send mode: fetch each service RSS feed in its own thread.
    for url in parser.url_list:
        get_rss_th = threading.Thread(target=parser.get_rss,name="get_rss_th", args=(url,))
        get_rss_th.start()
if args.send_mode.upper() == "FALSE":
    # Discovery mode: emit the low-level-discovery JSON for Zabbix.
    print json.dumps(parser.lld_json)

parser.close()
htmldata.close()
        # Tail of __init__ (class header is above this chunk): accumulator
        # for the text chunks delivered by handle_data.
        self.result = []

    def handle_data(self, data):
        # Collect every text node encountered while parsing.
        self.result.append(data)

    def get_result(self):
        # NOTE(review): self.RESULT is never assigned in this chunk — looks
        # like a typo for self.result (would raise AttributeError); confirm.
        return self.RESULT


# -------------------
# Main
# -------------------
if __name__ == '__main__':
    parser = MyHTMLParser()
    parser.feed(
        '<span class="timestamp"><b>18:09:35</b> </span>[2020-04-30T18:09:35 -0700] Starting caddy in /tmp/caddy-1000/8443 on port 8443'
    )
    # you can use multiple 'xxx' 'ccc' '3333' into the feed
    print(
        "------------------------------------------------------------------------------------"
    )
    parser.feed(
        '<span class="timestamp"><b>01:06:08</b> </span>io/test_preview.py::testPreviewCorrectness[100-/home/jenkins/workspace/XCETest/buildOut/src/data/qa/csvSanity-csv-columnNameSpace.csv-30-simpleDataset-columnNameSpace.csv] PASSED [ 97%]'
    )
    print(
        "------------------------------------------------------------------------------------"
    )
    parser.feed(
        '<span class="timestamp"><b>01:06:17</b> </span>io/test_preview.py::testPreviewNotExists PASSED [ 98%]'
    )
    print(