Code Example #1
File: trec.py  Project: clover3/Chair
def load_trec(path, dialect = 0):
    # use default ``xml.sax.expatreader``
    all_entry = []
    def callback(entry):
        all_entry.append(entry)

    if dialect == 0:
        parser = TrecParser(callback)
    elif dialect == 1:
        parser = TrecParser2(callback)
    elif dialect == 2:
        parser = TrecParser3(callback)
    else:
        raise ValueError("unknown dialect: %d" % dialect)

    with open(path, encoding='utf-8', errors='ignore') as f:
        for buffer in f:
            try:
                parser.feed(buffer)
            except StopIteration:
                break

    index_corpus = {}
    for entry in all_entry:
        index_corpus[entry["DOCNO"]] = entry["TEXT"]

    return index_corpus
Code Example #2
File: models.py  Project: aarongut/cmdasana
 def text(self):
     if 'html_text' in self.object_dict:
         parser = HTMLTextParser()
         parser.feed(self.object_dict['html_text'])
         parser.close()
         return parser.get_formatted_text()
     else:
         return [self.object_dict['text']]
Code Example #3
def dehtml(text):
    try:
        parser = _DeHTMLParser()
        parser.feed(str(text))
        parser.close()
        return parser.text()
    except Exception:
        import sys
        from traceback import print_exc
        print_exc(file=sys.stderr)
        return text
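The _DeHTMLParser class is not included in this snippet. As a minimal sketch (a hypothetical reconstruction, not the original implementation), a text-extracting parser with the same feed()/close()/text() interface could look like:

from html.parser import HTMLParser

class _DeHTMLParser(HTMLParser):
    def __init__(self):
        super().__init__()
        self._chunks = []

    def handle_data(self, data):
        # Accumulate the text found between tags.
        self._chunks.append(data)

    def handle_starttag(self, tag, attrs):
        # Treat block-level tags as line breaks so the plain text stays readable.
        if tag in ('p', 'br', 'li', 'div'):
            self._chunks.append('\n')

    def text(self):
        return ''.join(self._chunks).strip()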
Code Example #4
 def _parse_xml_string_to_dom(self, xml_string: str) -> ETree.Element:
     try:
         parser = ETree.XMLParser(target=ETree.TreeBuilder(), encoding=self.DEFAULT_ENCODING)
         parser.feed(xml_string)
         root = parser.close()
     except ETree.ParseError as e:
         raise RequestParserError(
             "Unable to parse request (%s), invalid XML received:\n%s" % (e, xml_string)
         )
     return root
Code Example #5
File: parser.py  Project: localstack/localstack
 def _parse_xml_string_to_dom(xml_string: str) -> ETree.Element:
     try:
         parser = ETree.XMLParser(target=ETree.TreeBuilder())
         parser.feed(xml_string)
         root = parser.close()
     except ETree.ParseError as e:
         raise ProtocolParserError(
             "Unable to parse request (%s), invalid XML received:\n%s" % (e, xml_string)
         ) from e
     return root
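Code Examples #4 and #5 both rely on the same incremental ElementTree API: feed() pushes raw XML into the parser and close() returns the root element assembled by the TreeBuilder target. A minimal standalone sketch (the XML payload below is illustrative, not taken from localstack):

import xml.etree.ElementTree as ETree

parser = ETree.XMLParser(target=ETree.TreeBuilder())
parser.feed('<request><action>DescribeInstances</action></request>')  # hypothetical payload
root = parser.close()
print(root.tag, root.find('action').text)  # request DescribeInstances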
Code Example #6
File: parser.py  Project: giacomocaironi/RssReader
def find_rss_link(site, parser):
    raw = requests.get(site).text  # get site
    parser.feed(raw)  # parse site
    feed_location = parser.feedLocation  # get rss file location
    if not feed_location:
        return None
    if feed_location[:4] != "http":
        file = feed_location
        base_site = site.split("/")[2]
        feed_location = "http://" + base_site + file
    return feed_location
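The manual URL reconstruction above assumes the relative location is rooted at the site and always served over http. A hedged alternative sketch using urllib.parse.urljoin, which handles both absolute and relative feed locations (the parser argument is assumed to expose the same feedLocation attribute):

import requests
from urllib.parse import urljoin

def find_rss_link_resolved(site, parser):
    parser.feed(requests.get(site).text)
    feed_location = parser.feedLocation
    if not feed_location:
        return None
    # urljoin leaves absolute URLs untouched and resolves relative ones
    # against the page they were found on, preserving the original scheme.
    return urljoin(site, feed_location)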
Code Example #7
    def html_to_text(self, html):
        if html is None:
            return None

        # Get rid of carriage returns and newlines.
        html = re.sub(r'\r|\n', '', html)
        # Get rid of non-breaking spaces.
        html = html.replace(u'\xa0', ' ')

        parser = ChunkingHtmlParser()
        parser.feed(html)
        return parser.get_text()
Code Example #8
def parseDirHtml(dirHtml, fileType):
    """Wrapper around HpwrenHTMLParser to pull out entries of given fileType

    Args:
        dirHtml (str): HTML page for directory listing
        fileType (str): File extension (e.g.: '.jpg')

    Returns:
        List of file names matching extension
    """
    parser = HpwrenHTMLParser(fileType)
    parser.feed(dirHtml)
    return parser.getTable()
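A hedged usage sketch; the directory URL is illustrative and parseDirHtml is called exactly as defined above:

import requests

dirHtml = requests.get('http://example.com/cameras/archive/').text  # hypothetical listing URL
jpegNames = parseDirHtml(dirHtml, '.jpg')
print(len(jpegNames), 'matching files')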
Code Example #9
File: models.py  Project: aarongut/cmdasana
 def description(self):
     if 'html_notes' in self.object_dict:
         parser = HTMLTextParser()
         parser.feed(self.object_dict['html_notes'])
         parser.close()
         text = parser.get_formatted_text()
         if (len(text) > 0):
             return text
         else:
             return ""
     elif 'notes' in self.object_dict:
         return self.object_dict['notes']
     else:
         return ""
Code Example #10
File: dumpscan.py  Project: steinarvk/wstats
def parse_revisions(filename):
    parser = xml.sax.make_parser(["xml.sax.IncrementalParser"])
    ready = deque()
    def deliver(x):
        ready.append(x)
    handler = DocumentHandler()
    handler.callback = deliver
    parser.setContentHandler(handler)
    with gzip.GzipFile(filename, "r") as raw_f:
        f = codecs.EncodedFile(raw_f, "utf8")
        for line in f:
            parser.feed(line)
            while ready:
                yield ready.popleft()
        parser.close()
        while ready:
            yield ready.popleft()
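The DocumentHandler class is not shown in this snippet; all the loop above requires is a SAX ContentHandler with a callback attribute that is invoked once per parsed document. A hypothetical sketch of that interface (the element name 'text' is an assumption, not taken from dumpscan.py):

import xml.sax

class DocumentHandler(xml.sax.handler.ContentHandler):
    def __init__(self):
        super().__init__()
        self.callback = None
        self._buffer = []
        self._collecting = False

    def startElement(self, name, attrs):
        if name == 'text':            # assumed per-revision element
            self._collecting = True
            self._buffer = []

    def characters(self, content):
        if self._collecting:
            self._buffer.append(content)

    def endElement(self, name):
        if name == 'text' and self._collecting:
            self._collecting = False
            if self.callback:
                self.callback(''.join(self._buffer))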
Code Example #11
def parse_description(description):
    """
    Some cinergi descriptions contain html.
    Parse out the data of the first element so
    we always return a regular string
    """
    if not description:
        return None

    class DescriptionHTMLParser(HTMLParser):
        contents = []

        def handle_data(self, data):
            self.contents.append(data)

    parser = DescriptionHTMLParser()
    parser.feed(description)
    return parser.contents[0]
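A quick usage sketch with a made-up description string; because only contents[0] is returned, everything after the first data chunk is dropped:

print(parse_description('<p>Soil moisture readings</p><p>updated hourly</p>'))
# -> 'Soil moisture readings'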
Code Example #12
File: trec.py  Project: clover3/Chair
def load_trec_date_fbis(path):
    all_entry = []
    def callback(entry):
        all_entry.append(entry)


    parser = TrecParser5(callback)
    with open(path, encoding='utf-8', errors='ignore') as f:
        for buffer in f:
            try:
                parser.feed(buffer)
            except StopIteration:
                break

    index_corpus = {}
    for entry in all_entry:
        date = entry["DATE1"] if "DATE1" in entry else ""
        headline = entry["HEADLINE"] if "HEADLINE" in entry else ""
        index_corpus[entry["DOCNO"]] = (date, headline)

    return index_corpus
Code Example #13
def find_link(site, parser):
    raw = requests.get(site).text  # get site
    parser.feed(raw)  # parse site
    feed_location = parser.feedLocation  # get rss file location
    return feed_location
Code Example #14
File: parse-aue.py  Project: ssss-38438-org/GAM
    'lenovo ideapad s330 chromebook': ['lenovo chromebook s330'],
    'lenovo n21 chromebook': [
        'asi chromebook', 'crambo chromebook', 'jp sa couto chromebook',
        'rgs education chromebook', 'true idc chromebook',
        'videonet chromebook', 'consumer chromebook'
    ],
    'lenovo thinkpad 11e 3rd gen chromebook':
    ['thinkpad 11e chromebook 3rd gen (yoga/clamshell)'],
    'lenovo thinkpad 11e 4th gen chromebook': [
        'lenovo thinkpad 11e chromebook (4th gen)/lenovo thinkpad yoga 11e chromebook (4th gen)'
    ],
    'lenovo thinkpad 13': ['thinkpad 13 chromebook'],
    'poin2 chromebook 14': ['poin2 chromebook 11c'],
    'prowise chromebook eduline': ['viglen chromebook 11c'],
    'prowise chromebook entryline': ['prowise 11.6\" entry line chromebook'],
    'prowise chromebook proline': ['prowise proline chromebook'],
    'samsung chromebook - xe303': ['samsung chromebook'],
}
next_data_is_oem = False
next_data_is_td = False
data_is_date = False
auepage = requests.get(
    'https://support.google.com/chrome/a/answer/6220366?hl=en')
print('CROS_AUE_DATES = {')
parser = MyHTMLParser()
parser.feed(auepage.content.decode('utf-8'))
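# output_rows is presumably populated by MyHTMLParser's handler methods while it
# parses the device table; the parser class itself is not shown in this snippet.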
output_rows.sort(key=str.lower)
for row in output_rows:
    print(row)
print('}')
Code Example #15
from HTMLParser import HTMLParser
import dateutil.parser

class MyHTMLParser(HTMLParser):
    def handle_starttag(self, tag, attrs):
        if tag == "abbr":
            if attrs[0][1] == "time published":
                print dateutil.parser.parse(attrs[1][1]).replace(day=15, hour=0, minute=0, second=0, microsecond=0).strftime('%s')


parser = MyHTMLParser()
f = open('stefan2904/html/messages.html')
parser.feed(f.read())
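This example targets Python 2 (the HTMLParser module and the print statement). A hedged Python 3 equivalent of the same idea:

from html.parser import HTMLParser
import dateutil.parser

class MyHTMLParser(HTMLParser):
    def handle_starttag(self, tag, attrs):
        # attrs is a list of (name, value) pairs in source order.
        if tag == 'abbr' and len(attrs) >= 2 and attrs[0][1] == 'time published':
            ts = dateutil.parser.parse(attrs[1][1])
            print(ts.replace(day=15, hour=0, minute=0, second=0,
                             microsecond=0).strftime('%s'))

parser = MyHTMLParser()
with open('stefan2904/html/messages.html') as f:
    parser.feed(f.read())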
Code Example #16
def validateHTML(body):
    parser = StackExchangeSite()
    # This is not the most efficient method: I couldn't find a way to stop the
    # tag search once an invalid HTML tag is found, so the body is always
    # iterated to the last bit of information. It is, however, certainly safe.
    parser.feed(body)
    return parser.flag
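One common workaround for the limitation noted in the comment above is to raise an exception from the tag handler and catch it in the caller, which aborts feed() at the first invalid tag. A hedged sketch (StrictValidator and its whitelist are hypothetical, not the original StackExchangeSite class):

from html.parser import HTMLParser

class InvalidTagFound(Exception):
    pass

class StrictValidator(HTMLParser):
    ALLOWED = {'a', 'b', 'i', 'em', 'strong', 'p', 'br', 'code', 'pre'}  # hypothetical whitelist

    def handle_starttag(self, tag, attrs):
        if tag not in self.ALLOWED:
            raise InvalidTagFound(tag)

def validateHTMLEarlyExit(body):
    parser = StrictValidator()
    try:
        parser.feed(body)
    except InvalidTagFound:
        return False  # stopped as soon as an invalid tag was seen
    return True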
Code Example #17
    parser = argparse.ArgumentParser(description='Get RSS list or Zabbix LLD format output from AWS Service Health Dashboard page.')
    parser.add_argument('-b', '--block', default="AP", help='set AWS region block(e.g.:NA or SA or EU or AP)')
    parser.add_argument('-i', '--interval', type=int, help='set interval time (seconds)')
    parser.add_argument('-m', '--send-mode', default='False', help='set True if you send AWS Service Health Dashboard status information. set False if you want to get lld format service list. (e.g.: True or False)')
    parser.add_argument('-p', '--zabbix-port', type=int, default=10051, help='set listening port number for Zabbix server')
    parser.add_argument('-z', '--zabbix-host', default='localhost', help='set listening IP address for Zabbix server')

    block_list = ["NA", "SA", "EU", "AP"]
    args = parser.parse_args()
    
    if args.block not in block_list:
        print "please set block name. :" + " or ".join(map(str, block_list))

    base_url = "http://status.aws.amazon.com/"
    socket.setdefaulttimeout(30) 
    htmldata = urllib2.urlopen(base_url)
    
    parser = AWSSHDParser(base_url, args.block, args.zabbix_host, args.zabbix_port)
    parser.feed(htmldata.read())

    if args.send_mode.upper() == "TRUE":
        for url in parser.url_list:
            get_rss_th = threading.Thread(target=parser.get_rss,name="get_rss_th", args=(url,))
            get_rss_th.start()

    if args.send_mode.upper() == "FALSE":
        print json.dumps(parser.lld_json)
        
    parser.close()
    htmldata.close()
Code Example #18
File: jenkins_parser.py  Project: xcalar/xcalar-infra
        self.result = []

    def handle_data(self, data):
        self.result.append(data)

    def get_result(self):
        return self.result


# -------------------
# Main
# -------------------
if __name__ == '__main__':
    parser = MyHTMLParser()
    parser.feed(
        '<span class="timestamp"><b>18:09:35</b> </span>[2020-04-30T18:09:35 -0700] Starting caddy in /tmp/caddy-1000/8443 on port 8443'
    )  # multiple chunks (e.g. 'xxx', 'ccc', '3333') can be fed into the parser one after another

    print(
        "------------------------------------------------------------------------------------"
    )
    parser.feed(
        '<span class="timestamp"><b>01:06:08</b> </span>io/test_preview.py::testPreviewCorrectness[100-/home/jenkins/workspace/XCETest/buildOut/src/data/qa/csvSanity-csv-columnNameSpace.csv-30-simpleDataset-columnNameSpace.csv] PASSED [ 97%]'
    )
    print(
        "------------------------------------------------------------------------------------"
    )
    parser.feed(
        '<span class="timestamp"><b>01:06:17</b> </span>io/test_preview.py::testPreviewNotExists PASSED                          [ 98%]'
    )
    print(