Example #1
def load_trec(path, dialect=0):
    # relies on incremental feed(), like the default ``xml.sax.expatreader``
    all_entry = []
    def callback(entry):
        all_entry.append(entry)

    if dialect == 0:
        parser = TrecParser(callback)
    elif dialect == 1:
        parser = TrecParser2(callback)
    elif dialect == 2:
        parser = TrecParser3(callback)
    else:
        raise ValueError("unknown dialect: %r" % dialect)

    with open(path, encoding='utf-8', errors='ignore') as f:
        for buffer in f:
            try:
                parser.feed(buffer)
            except StopIteration:
                break

    index_corpus = {}
    for entry in all_entry:
        index_corpus[entry["DOCNO"]] = entry["TEXT"]

    return index_corpus
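TrecParser (and its TrecParser2/TrecParser3 dialect variants) is defined elsewhere in this project. A minimal sketch of the shape load_trec relies on: a line-oriented parser whose callback receives one dict per document (the tag handling below is an assumption; the real parsers may also raise StopIteration to end parsing early).

class TrecParser:
    # Hypothetical sketch: accumulates <DOC> blocks line by line and
    # hands each finished {"DOCNO": ..., "TEXT": ...} dict to callback.
    def __init__(self, callback):
        self.callback = callback
        self.entry = None
        self.in_text = False

    def feed(self, line):
        line = line.strip()
        if line == "<DOC>":
            self.entry = {}
        elif line == "</DOC>":
            self.callback(self.entry)
            self.entry = None
        elif self.entry is not None:
            if line.startswith("<DOCNO>"):
                self.entry["DOCNO"] = line.replace("<DOCNO>", "").replace("</DOCNO>", "").strip()
            elif line == "<TEXT>":
                self.in_text = True
                self.entry["TEXT"] = ""
            elif line == "</TEXT>":
                self.in_text = False
            elif self.in_text:
                self.entry["TEXT"] += line + "\n"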
Example #2
 def text(self):
     if 'html_text' in self.object_dict:
         parser = HTMLTextParser()
         parser.feed(self.object_dict['html_text'])
         parser.close()
         return parser.get_formatted_text()
     else:
         return [self.object_dict['text']]
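HTMLTextParser is not shown in this snippet. A plausible minimal version, assuming get_formatted_text() returns a list of text lines (consistent with the else branch returning [self.object_dict['text']]):

from html.parser import HTMLParser

class HTMLTextParser(HTMLParser):
    # Hypothetical stand-in: collects character data and returns the
    # non-empty lines as a list.
    def __init__(self):
        super().__init__()
        self._chunks = []

    def handle_data(self, data):
        self._chunks.append(data)

    def get_formatted_text(self):
        return [line for line in "".join(self._chunks).splitlines() if line.strip()]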
Example #3
import sys

def dehtml(text):
    try:
        parser = _DeHTMLParser()
        parser.feed(str(text))
        parser.close()
        return parser.text()
    except Exception:
        # log the traceback but fall back to returning the raw input
        from traceback import print_exc
        print_exc(file=sys.stderr)
        return text
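_DeHTMLParser is likewise defined elsewhere. A minimal sketch under the assumption that text() returns the accumulated plain text with block-level tags mapped to newlines:

from html.parser import HTMLParser

class _DeHTMLParser(HTMLParser):
    # Hypothetical sketch: drops tags, turning <p> and <br> into
    # newlines so the result reads as plain text.
    def __init__(self):
        super().__init__()
        self._parts = []

    def handle_starttag(self, tag, attrs):
        if tag in ("p", "br"):
            self._parts.append("\n")

    def handle_data(self, data):
        self._parts.append(data)

    def text(self):
        return "".join(self._parts)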
Example #4
 def _parse_xml_string_to_dom(self, xml_string: str) -> ETree.Element:
     try:
         parser = ETree.XMLParser(target=ETree.TreeBuilder(), encoding=self.DEFAULT_ENCODING)
         parser.feed(xml_string)
         root = parser.close()
     except ETree.ParseError as e:
         raise RequestParserError(
             "Unable to parse request (%s), invalid XML received:\n%s" % (e, xml_string)
         )
     return root
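The feed()/close() pairing on xml.etree.ElementTree.XMLParser works the same way outside this class; a self-contained demonstration:

import xml.etree.ElementTree as ETree

parser = ETree.XMLParser(target=ETree.TreeBuilder())
parser.feed('<Action><Name>DescribeInstances</Name></Action>')
root = parser.close()          # close() returns the TreeBuilder's root Element
print(root.find('Name').text)  # -> DescribeInstances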
Example #5
 def _parse_xml_string_to_dom(xml_string: str) -> ETree.Element:
     try:
         parser = ETree.XMLParser(target=ETree.TreeBuilder())
         parser.feed(xml_string)
         root = parser.close()
     except ETree.ParseError as e:
         raise ProtocolParserError(
             "Unable to parse request (%s), invalid XML received:\n%s" % (e, xml_string)
         ) from e
     return root
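This variant differs from Example #4 in two details: it omits the explicit encoding argument, and it chains the original exception with `from e`, which keeps the underlying ParseError attached as __cause__ in the traceback.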
Example #6
def find_rss_link(site, parser):
    raw = requests.get(site).text  # get site
    parser.feed(raw)  # parse site
    feed_location = parser.feedLocation  # get rss file location
    if not feed_location:
        return None
    if not feed_location.startswith("http"):
        # relative location: rebuild an absolute URL from the site's host
        file = feed_location
        base_site = site.split("/")[2]
        feed_location = "http://" + base_site + file
    return feed_location
Example #7
    def html_to_text(self, html):
        if html is None:
            return None

        # Get rid of carriage returns and newlines.
        html = re.sub(r'\r|\n', '', html)
        # Get rid of non-breaking spaces.
        html = html.replace(u'\xa0', ' ')

        parser = ChunkingHtmlParser()
        parser.feed(html)
        return parser.get_text()
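ChunkingHtmlParser is project-specific. A minimal stand-in, assuming get_text() joins the character data the parser saw:

from html.parser import HTMLParser

class ChunkingHtmlParser(HTMLParser):
    # Hypothetical sketch: gathers stripped text chunks and joins them
    # with single spaces.
    def __init__(self):
        super().__init__()
        self._chunks = []

    def handle_data(self, data):
        if data.strip():
            self._chunks.append(data.strip())

    def get_text(self):
        return " ".join(self._chunks)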
Example #8
def parseDirHtml(dirHtml, fileType):
    """Wrapper around HpwrenHTMLParser to pull out entries of given fileType

    Args:
        dirHtml (str): HTML page for directory listing
        fileType (str): File extension (e.g.: '.jpg')

    Returns:
        List of file names matching extension
    """
    parser = HpwrenHTMLParser(fileType)
    parser.feed(dirHtml)
    return parser.getTable()
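HpwrenHTMLParser is not shown. Given the docstring, a plausible minimal implementation that collects anchor hrefs matching the extension:

from html.parser import HTMLParser

class HpwrenHTMLParser(HTMLParser):
    # Hypothetical sketch: records href values of <a> tags whose
    # targets end with the requested extension.
    def __init__(self, fileType):
        super().__init__()
        self.fileType = fileType
        self.table = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value and value.endswith(self.fileType):
                    self.table.append(value)

    def getTable(self):
        return self.table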
Example #9
 def description(self):
     if 'html_notes' in self.object_dict:
         parser = HTMLTextParser()
         parser.feed(self.object_dict['html_notes'])
         parser.close()
         text = parser.get_formatted_text()
         if len(text) > 0:
             return text
         else:
             return ""
     elif 'notes' in self.object_dict:
         return self.object_dict['notes']
     else:
         return ""
Example #10
import codecs
import gzip
import xml.sax
from collections import deque

def parse_revisions(filename):
    # the default parser (xml.sax.expatreader) already supports
    # incremental feed(); the old argument ["xml.sax.IncrementalParser"]
    # named no importable module and was silently ignored
    parser = xml.sax.make_parser()
    ready = deque()
    def deliver(x):
        ready.append(x)
    handler = DocumentHandler()
    handler.callback = deliver
    parser.setContentHandler(handler)
    with gzip.GzipFile(filename, "r") as raw_f:
        f = codecs.EncodedFile(raw_f, "utf8")
        for line in f:
            parser.feed(line)
            while ready:
                yield ready.popleft()
        parser.close()
        while ready:
            yield ready.popleft()
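DocumentHandler is defined elsewhere; it must be a SAX ContentHandler that calls self.callback once per finished record. A minimal sketch, assuming the records of interest are <revision> elements (the element name is a guess):

import xml.sax

class DocumentHandler(xml.sax.ContentHandler):
    # Hypothetical sketch: buffers character data per element and hands
    # each completed <revision> dict to self.callback.
    def __init__(self):
        super().__init__()
        self.callback = None
        self.revision = None
        self.text = []

    def startElement(self, name, attrs):
        if name == "revision":
            self.revision = {}
        self.text = []

    def characters(self, content):
        self.text.append(content)

    def endElement(self, name):
        if self.revision is None:
            return
        if name == "revision":
            self.callback(self.revision)
            self.revision = None
        else:
            self.revision[name] = "".join(self.text)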
Example #11
from html.parser import HTMLParser

def parse_description(description):
    """
    Some cinergi descriptions contain html.
    Parse out the data of the first element so
    we always return a regular string
    """
    if not description:
        return None

    class DescriptionHTMLParser(HTMLParser):
        def __init__(self):
            super().__init__()
            # instance attribute so repeated calls don't append to a
            # shared class-level list
            self.contents = []

        def handle_data(self, data):
            self.contents.append(data)

    parser = DescriptionHTMLParser()
    parser.feed(description)
    return parser.contents[0]
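A quick usage check, since the parser is defined inline (expected output shown as a comment):

print(parse_description('<p>Sea floor <b>depth</b></p>'))  # -> 'Sea floor '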
Example #12
def load_trec_date_fbis(path):
    all_entry = []
    def callback(entry):
        all_entry.append(entry)

    parser = TrecParser5(callback)
    with open(path, encoding='utf-8', errors='ignore') as f:
        for buffer in f:
            try:
                parser.feed(buffer)
            except StopIteration:
                break

    index_corpus = {}
    for entry in all_entry:
        date = entry.get("DATE1", "")
        headline = entry.get("HEADLINE", "")
        index_corpus[entry["DOCNO"]] = (date, headline)

    return index_corpus
Example #13
def find_link(site, parser):
    raw = requests.get(site).text  # get site
    parser.feed(raw)  # parse site
    feed_location = parser.feedLocation  # get rss file location
    return feed_location
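Examples #6 and #13 both depend on a parser exposing a feedLocation attribute. A plausible minimal implementation (the class name here is hypothetical) that records the standard RSS autodiscovery <link>:

from html.parser import HTMLParser

class FeedLinkParser(HTMLParser):
    # Hypothetical sketch: keeps the href of the first
    # <link rel="alternate" type="application/rss+xml" href="..."> seen.
    def __init__(self):
        super().__init__()
        self.feedLocation = ""

    def handle_starttag(self, tag, attrs):
        if tag == "link" and not self.feedLocation:
            d = dict(attrs)
            if d.get("type") == "application/rss+xml" and d.get("href"):
                self.feedLocation = d["href"]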
Example #14
    'lenovo ideapad s330 chromebook': ['lenovo chromebook s330'],
    'lenovo n21 chromebook': [
        'asi chromebook', 'crambo chromebook', 'jp sa couto chromebook',
        'rgs education chromebook', 'true idc chromebook',
        'videonet chromebook', 'consumer chromebook'
    ],
    'lenovo thinkpad 11e 3rd gen chromebook':
    ['thinkpad 11e chromebook 3rd gen (yoga/clamshell)'],
    'lenovo thinkpad 11e 4th gen chromebook': [
        'lenovo thinkpad 11e chromebook (4th gen)/lenovo thinkpad yoga 11e chromebook (4th gen)'
    ],
    'lenovo thinkpad 13': ['thinkpad 13 chromebook'],
    'poin2 chromebook 14': ['poin2 chromebook 11c'],
    'prowise chromebook eduline': ['viglen chromebook 11c'],
    'prowise chromebook entryline': ['prowise 11.6\" entry line chromebook'],
    'prowise chromebook proline': ['prowise proline chromebook'],
    'samsung chromebook - xe303': ['samsung chromebook'],
}
next_data_is_oem = False
next_data_is_td = False
data_is_date = False
auepage = requests.get(
    'https://support.google.com/chrome/a/answer/6220366?hl=en')
print('CROS_AUE_DATES = {')
parser = MyHTMLParser()
parser.feed(auepage.content.decode('utf-8'))
output_rows.sort(key=str.lower)
for row in output_rows:
    print(row)
print('}')
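The MyHTMLParser this script instantiates is not shown; judging by the module-level flags above, it walks the <td> cells of Google's Auto Update Expiration table and appends formatted rows to output_rows. A heavily simplified sketch of that shape (the cell layout and row format are assumptions):

from html.parser import HTMLParser

output_rows = []  # the script above sorts and prints this module-level list

class MyHTMLParser(HTMLParser):
    # Hypothetical sketch: treats the first cell of each table row as
    # the model name and the second as its AUE date.
    def __init__(self):
        super().__init__()
        self.in_td = False
        self.row = []

    def handle_starttag(self, tag, attrs):
        if tag == 'tr':
            self.row = []
        elif tag == 'td':
            self.in_td = True

    def handle_endtag(self, tag):
        if tag == 'td':
            self.in_td = False
        elif tag == 'tr' and len(self.row) >= 2:
            output_rows.append("    '%s': '%s'," % (self.row[0].lower(), self.row[1]))

    def handle_data(self, data):
        if self.in_td and data.strip():
            self.row.append(data.strip())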
Example #15
from html.parser import HTMLParser  # HTMLParser module in the original Python 2 code
import dateutil.parser

class MyHTMLParser(HTMLParser):
    def handle_starttag(self, tag, attrs):
        if tag == "abbr":
            if len(attrs) > 1 and attrs[0][1] == "time published":
                # normalize to the 15th at midnight, print as a Unix timestamp
                print(dateutil.parser.parse(attrs[1][1])
                      .replace(day=15, hour=0, minute=0, second=0, microsecond=0)
                      .strftime('%s'))


parser = MyHTMLParser()
with open('stefan2904/html/messages.html') as f:
    parser.feed(f.read())
Example #16
def validateHTML(body):
    # Not the most efficient approach: there is no easy way to stop the
    # tag scan once an invalid tag is found, so the whole body is always
    # parsed. It is, however, safe.
    parser = StackExchangeSite()
    parser.feed(body)
    return parser.flag
Example #17
    parser = argparse.ArgumentParser(
        description='Get RSS list or Zabbix LLD format output from AWS Service Health Dashboard page.')
    parser.add_argument('-b', '--block', default="AP",
                        help='set AWS region block (e.g.: NA or SA or EU or AP)')
    parser.add_argument('-i', '--interval', type=int, help='set interval time (seconds)')
    parser.add_argument('-m', '--send-mode', default='False',
                        help='set True to send AWS Service Health Dashboard status information; '
                             'set False to get the LLD-format service list (e.g.: True or False)')
    parser.add_argument('-p', '--zabbix-port', type=int, default=10051,
                        help='set listening port number for Zabbix server')
    parser.add_argument('-z', '--zabbix-host', default='localhost',
                        help='set listening IP address for Zabbix server')

    block_list = ["NA", "SA", "EU", "AP"]
    args = parser.parse_args()

    if args.block not in block_list:
        print("please set block name: " + " or ".join(block_list))

    base_url = "http://status.aws.amazon.com/"
    socket.setdefaulttimeout(30)
    # urllib2.urlopen in the original Python 2 code
    htmldata = urllib.request.urlopen(base_url)

    parser = AWSSHDParser(base_url, args.block, args.zabbix_host, args.zabbix_port)
    # read() returns bytes; decode before feeding the HTML parser
    parser.feed(htmldata.read().decode('utf-8'))

    if args.send_mode.upper() == "TRUE":
        for url in parser.url_list:
            get_rss_th = threading.Thread(target=parser.get_rss, name="get_rss_th", args=(url,))
            get_rss_th.start()

    if args.send_mode.upper() == "FALSE":
        print(json.dumps(parser.lld_json))

    parser.close()
    htmldata.close()
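AWSSHDParser is not included here. A minimal sketch of the interface this script uses (url_list, lld_json, and a get_rss worker), with the Zabbix sender logic omitted; everything below is an assumption:

from html.parser import HTMLParser

class AWSSHDParser(HTMLParser):
    # Hypothetical sketch: collects the dashboard's .rss links and
    # mirrors them into a Zabbix low-level-discovery structure.
    def __init__(self, base_url, block, zabbix_host, zabbix_port):
        super().__init__()
        self.base_url = base_url
        self.block = block
        self.zabbix_host = zabbix_host
        self.zabbix_port = zabbix_port
        self.url_list = []
        self.lld_json = {"data": []}

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            href = dict(attrs).get("href", "")
            if href.endswith(".rss"):
                self.url_list.append(self.base_url + href.lstrip("/"))
                self.lld_json["data"].append({"{#SERVICE}": href})

    def get_rss(self, url):
        # fetching the feed and pushing status to Zabbix would go here
        pass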
Example #18
from html.parser import HTMLParser

class MyHTMLParser(HTMLParser):
    def __init__(self):
        super().__init__()
        self.result = []

    def handle_data(self, data):
        self.result.append(data)

    def get_result(self):
        return self.result


# -------------------
# Main
# -------------------
if __name__ == '__main__':
    parser = MyHTMLParser()
    parser.feed(
        '<span class="timestamp"><b>18:09:35</b> </span>[2020-04-30T18:09:35 -0700] Starting caddy in /tmp/caddy-1000/8443 on port 8443'
    )  # feed() can be called any number of times; input is buffered across calls

    print(
        "------------------------------------------------------------------------------------"
    )
    parser.feed(
        '<span class="timestamp"><b>01:06:08</b> </span>io/test_preview.py::testPreviewCorrectness[100-/home/jenkins/workspace/XCETest/buildOut/src/data/qa/csvSanity-csv-columnNameSpace.csv-30-simpleDataset-columnNameSpace.csv] PASSED [ 97%]'
    )
    print(
        "------------------------------------------------------------------------------------"
    )
    parser.feed(
        '<span class="timestamp"><b>01:06:17</b> </span>io/test_preview.py::testPreviewNotExists PASSED                          [ 98%]'
    )
    print(
        "------------------------------------------------------------------------------------"
    )