Example #1
def scrape(html_content):
    """
    This function could have been a "private" utility, but I left it "public" (note the quotation marks: Python has no
    real public or private access modifiers) because this scraping functionality can be invoked directly with raw HTML
    content, effectively bypassing the actual HTTP request. This has been useful, for instance, for unit testing this
    behavior.
    :param html_content: a string containing the actual HTML content. Cannot be None or empty.
    :return: a dictionary with two keys: "total" and "top5", containing the total number of elements and the count of
    the top 5 ones, respectively.
    """
    if not html_content:
        raise ValueError('Input is empty')

    parser = HtmlElementsCounter()
    parser.feed(html_content)
    parser.close()  # instructs the parser to consume the input entirely

    total = sum(parser.occurrences_by_tag.values())

    # if the input only has N different elements (N < 5), this list will hold exactly N entries
    top5_elements_with_occurrences = sorted(parser.occurrences_by_tag.items(),
                                            reverse=True,
                                            key=lambda x: x[1])[:5]

    return dict(total=total, top5=top5_elements_with_occurrences)
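
The docstring above points out that scrape() can be exercised directly with raw HTML, skipping the HTTP request. A minimal, hypothetical unit-test-style call along those lines (the exact counts depend on how HtmlElementsCounter tallies tags):

# Hypothetical usage: feed raw HTML straight into scrape() for a unit test.
sample_html = "<html><body><p>a</p><p>b</p><div>c</div></body></html>"
result = scrape(sample_html)
assert set(result) == {"total", "top5"}
# With only a handful of distinct tags, "top5" holds every entry, so its counts sum to "total".
assert result["total"] == sum(count for _, count in result["top5"])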
Example #2
def get_dependencies(path):
    deps = set()
    parser = DependenciesParser(deps.add)
    with open(path) as f:
        parser.feed(f.read())
    parser.close()
    return iter(deps)
Example #3
 def auth_usr(email, password, client_id, scope, opener):
     print("TRY TO AUTH")
     # TODO: catch the exception
     login_page = "http://oauth.vk.com/oauth/authorize?" + \
             "redirect_uri=http://oauth.vk.com/blank.html&response_type=token&" + \
             "client_id=%s&scope=%s&display=wap" % (client_id, ",".join(scope))
     #print(login_page)
     auth_page = opener.open(login_page)
     auth_page = auth_page.read()
     parser = AuthParser()
     parser.feed(str(auth_page))
     parser.close()
     if not parser.form_parsed or parser.url is None or "pass" not in parser.params or \
           "email" not in parser.params or parser.method != "POST":
         parser.error = "Some problems"
     if parser.error != "OK":
         return -1, -1, parser.error
     parser.params["email"] = email
     parser.params["pass"] = password
     parser.params["v"] = "5.2"
     # TODO: catch the exception
     response = opener.open(parser.url, urllib.parse.urlencode(parser.params).encode("UTF-8"))
     page = response.read()
     url = response.geturl()
     return page, url, parser.error
Example #4
    def getentry(self):
        # Start with the entry from the parent.
        entry = FileHandler.getentry(self)
        parser = HTMLTitleParser()
        file = self.vfs.open(self.getselector(), "rt")
        try:
            while not parser.gotcompletetitle:
                line = file.readline()
                if not line:
                    break
                parser.feed(line)
            parser.close()
        except html.parser.HTMLParseError:
            # Parse error?  Stop parsing, go to here.  We can still
            # return a title if the parse error happened after we got
            # the title.
            pass

        file.close()
        # OK, we've parsed the file and exited because of either an EOF
        # or a complete title (or error).  Now, figure out what happened.

        if parser.gotcompletetitle:
            # Convert all whitespace sequences to a single space.
            # Removes newlines, tabs, etc.  Good for presentation
            # and for security.
            title = re.sub(r'[\s]+', ' ', parser.titlestr)
            entry.setname(title)
        return entry
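
HTMLTitleParser itself is not part of the snippet; the attributes it is used with above (gotcompletetitle, titlestr) suggest a small html.parser.HTMLParser subclass. A rough sketch under that assumption:

import html.parser

class HTMLTitleParser(html.parser.HTMLParser):
    """Hypothetical title extractor matching the attributes used above."""

    def __init__(self):
        super().__init__()
        self.titlestr = ""
        self.gotcompletetitle = False
        self._in_title = False

    def handle_starttag(self, tag, attrs):
        if tag == "title":
            self._in_title = True

    def handle_data(self, data):
        if self._in_title:
            self.titlestr += data

    def handle_endtag(self, tag):
        if tag == "title":
            self._in_title = False
            self.gotcompletetitle = True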
Example #5
        def run(self):

            self.parent.visited += [self.url]

            if self.gleb > 0:
                parser = self.HParser(self.url)

                contents = ""
                try:
                    response = urlopen(self.url)
                    contents = response.read().decode('utf-8')
                except HTTPError:
                    pass
                parser.feed(contents)

                parser.close()
                for v in parser.vals:  # crawl the subpages
                    if v not in self.parent.visited:
                        thr = Probe.Thr(v,self.fun,self.gleb-1, self.parent)
                        thr.start()
                        self.parent.threads += [thr]
                        self.children += [thr]

            self.fun(self.url)  # run the action on the current page

            # wait for the child threads
            for t in self.children:
                t.join()
Example #6
    def _query_eol_list(self) -> typing.List[str]:
        """Scrape the FreeBSD website and return a list of EOL RELEASES."""
        request = urllib.request.Request(
            self.eol_url,
            headers={
                "Accept-Charset": "utf-8"
            }
        )
        self.logger.verbose(f"Downloading EOL info from {self.eol_url}")
        with urllib.request.urlopen(request) as response:  # nosec: B310

            response_code = response.getcode()
            if response_code != 200:  # noqa: T484
                libioc.errors.DownloadFailed(
                    topic="EOL Warnings",
                    code=response_code,
                    logger=self.logger,
                    level="warning"
                )
                return []

            parser = EOLParser()
            data = response.read().decode("utf-8", "ignore")
            parser.feed(data)
            parser.close()

            return parser.eol_releases
Example #7
def simplify_html(s):
    """Make a real HTML text compatible with Telegram's pseudo-HTML"""

    parser = _HtmlSimplifying()
    parser.feed(s)
    parser.close()
    return parser.result
def main():
    """Make all of the above work together to finally print the RSS feed."""
    # Initial request
    html_string = get_response_body('/archive?type=episodes')
    # Prepare headers for following requests
    HEADERS['Referer'] = (
        'https://www.thisamericanlife.org/archive?type=episodes'
    )
    HEADERS['X-Requested-With'] = 'XMLHttpRequest'
    parser = Parser()
    parser.feed(html_string)
    tree = parser.close()
    episodes = findall_episodes(tree)
    count = tree.find('.//div[@class="count-sort"]/div[@class="count"]').text
    count = int(count.split()[2])
    for page in range(int(count / 48)):
        page = page + 1
        time.sleep(1)
        json_string = get_response_body(f'/archive?type=episodes&page={page}')
        html_string = json.loads(json_string)['html']
        parser = Parser()
        parser.feed(html_string)
        tree = parser.close()
        new_episodes = findall_episodes(tree)
        episodes = episodes + new_episodes

    RSS['rss']['channel']['item'] = episodes
    xml_tree = dictionary_to_xml(RSS)
    xml_string = xml.etree.ElementTree.tostring(
        xml_tree, encoding='utf-8', method='xml'
    ).decode()
    print(xml_string)
 def _run_check(self, source, expected_events, collector=EventCollector):
     parser = collector()
     for s in source:
         parser.feed(s)
     parser.close()
     events = parser.get_events()
     if events != expected_events:
         self.fail("received events did not match expected events\n"
                   "Expected:\n" + pprint.pformat(expected_events) +
                   "\nReceived:\n" + pprint.pformat(events))
Example #10
 def _run_check(self, source, expected_events, collector=EventCollector):
     parser = collector()
     for s in source:
         parser.feed(s)
     parser.close()
     events = parser.get_events()
     if events != expected_events:
         self.fail("received events did not match expected events\n"
                   "Expected:\n" + pprint.pformat(expected_events) +
                   "\nReceived:\n" + pprint.pformat(events))
 def parse_links(self):
     'Parse out the links found in downloaded HTML file'
     f = open(self.file, 'r', encoding='utf8')
     data = f.read()
     f.close()
     parser = MyHTMLParser(
         formatter.AbstractFormatter(formatter.DumbWriter(io.StringIO())))
     parser.feed(data)
     parser.close()
     return parser.get_anchorlist()
Example #12
 def access(page, opener):
     parser = AuthParser()
     parser.feed(str(page))
     parser.close()
     if not parser.form_parsed or parser.url is None or parser.method != "POST":
         parser.error = "Problems with giving access"
         return -1, -1, parser.error
     # TODO: catch the exception
     response = opener.open(parser.url, urllib.parse.urlencode(parser.params).encode("UTF-8"))
     return response.geturl()
 def parse_links(self):
     'Parse out the links found in downloaded HTML file'
     f = open(self.file, 'r', encoding='utf8')
     data = f.read()
     f.close()
     parser = MyHTMLParser(formatter.AbstractFormatter(
         formatter.DumbWriter(io.StringIO())))
     parser.feed(data)
     parser.close()
     return parser.get_anchorlist()
Example #14
def parse_mse_text(text, ignore_soft_newlines=True):
    parser = MSETextParser(ignore_soft_newlines=ignore_soft_newlines)
    parser.feed(text)
    parser.close()
    result = parser.result.replace('•', '\n•') # add line breaks before bullet points because the parser removes soft line breaks
    while '  ' in result:
        result = result.replace('  ', ' ') # remove double spaces that can be generated by replacing soft newlines with spaces
    result = result.replace(' \n', '\n') # remove spaces before newlines
    result = result.replace('\n ', '\n') # remove spaces after newlines
    result = result.strip(' ') # remove spaces at start/end
    return result.replace('“', '"').replace('”', '"').replace('‘', "'").replace('’', "'"), parser.color_identity
Example #15
def ExtractNamespaces(html):
    """
    Extract the list of namespaces from an html page.
    """
    parser = NamespacesFilter()
    try:
        parser.feed(html)
    finally:
        parser.close()

    return parser.namespaces
Example #16
 def give_access(doc, opener):
     parser = FormParser()
     parser.feed(doc.decode('utf-8'))
     parser.close()
     if not parser.form_parsed or parser.url is None:
           raise RuntimeError("Something wrong")
     if parser.method == "POST":
         response = opener.open(parser.url, urllib.parse.urlencode(parser.params).encode('utf-8'))
     else:
         raise NotImplementedError("Method '%s'" % parser.method)
     return response.geturl()
Example #17
 def _give_access(self, doc):
     parser = _FormParser()
     parser.feed(str(doc))
     parser.close()
     if not parser.form_parsed or parser.url is None:
         raise VKAuthError('Invalid email or password')
     if parser.method == 'post':
         response = self._opener.open(parser.url, urllib.parse.urlencode(parser.params).encode())
     else:
         raise VKAuthError('Unexpected method: ' + parser.method)
     return response.geturl()
def parseFileRefs(htmlfile, usedFiles, skipFiles, indent, trace=print):
    """
    find files referenced in root, recur for html files
    """
    trace("%sParsing:" % ("." * indent), htmlfile)
    parser = MyParser(usedFiles, skipFiles, indent)
    text = open(htmlfile).read()
    try:
        parser.feed(text)
    except html.parser.HTMLParseError as E:
        print("==>FAILED:", E)  # file's refs may be missed!
    parser.close()
Example #19
    def __give_access(self, doc):
        parser = FormParser()
        parser.feed(str(doc))
        parser.close()
        if not parser.form_parsed or parser.url is None:
              raise VKAuthError(4, "Invalid email or password")
        if parser.method == "post":
            response = self.__opener.open(parser.url, urllib.parse.urlencode(parser.params).encode())
        else:
            raise VKAuthError(5, "Method "+parser.method)

        return response.geturl()        
Example #20
def parseFileRefs(htmlfile, usedFiles, skipFiles, indent, trace=print):
    """
    find files referenced in root, recur for html files
    """
    trace('%sParsing:' % ('.' * indent), htmlfile)
    parser = MyParser(usedFiles, skipFiles, indent)
    text = open(htmlfile).read()
    try:
        parser.feed(text)
    except html.parser.HTMLParseError as E:
        print('==>FAILED:', E)  # file's refs may be missed!
    parser.close()
Example #21
    def retrieve_capabilities(self,
                              url,
                              urlchain=[],
                              pool=None,
                              identity=None):
        """
        connect to the given URL, retrieve and process the
        capabilities/withdrawals found there
        """

        # detect loops in capability links
        if url in urlchain:
            return

        if not self._default_url:
            self.set_default_url(url)

        if isinstance(url, str):
            url = urllib3.util.parse_url(url)

        if identity is None:
            identity = self._tls_state.extract_peer_identity(url)

        if pool is None:
            if url.host is not None:
                pool = self._tls_state.pool_for(url.scheme, url.host, url.port)
            else:
                print("ConnectionPool not defined")
                exit(1)

        if url.path is not None:
            path = url.path
        else:
            path = "/"
        res = pool.request('GET', path)

        if res.status == 200:
            ctype = res.getheader("Content-Type")
            if ctype == "application/x-mplane+json":
                # Probably an envelope. Process the message.
                self.handle_message(
                    mplane.model.parse_json(res.data.decode("utf-8")),
                    identity)
            elif ctype == "text/html":
                # Treat as a list of links to capability messages.
                parser = CrawlParser(strict=False)
                parser.feed(res.data.decode("utf-8"))
                parser.close()
                for capurl in parser.urls:
                    self.retrieve_capabilities(url=capurl,
                                               urlchain=urlchain + [url],
                                               pool=pool,
                                               identity=identity)
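
CrawlParser is not included in the snippet; the loop over parser.urls suggests an html.parser.HTMLParser subclass that collects link targets. A rough, assumed sketch (the strict argument is accepted only to match the call above and ignored, since modern html.parser dropped it):

import html.parser

class CrawlParser(html.parser.HTMLParser):
    """Hypothetical parser: gathers href targets of <a> tags into self.urls."""

    def __init__(self, strict=False, **kwargs):
        super().__init__(**kwargs)
        self.urls = []

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            href = dict(attrs).get("href")
            if href:
                self.urls.append(href)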
Example #22
    def retrieve_capabilities(self, url, urlchain=[], pool=None, identity=None):
        """
        connect to the given URL, retrieve and process the
        capabilities/withdrawals found there
        """

        # detect loops in capability links
        if url in urlchain:
            return

        if not self._default_url:
            self.set_default_url(url)

        if isinstance(url, str):
            url = urllib3.util.parse_url(url)

        if identity is None:
            identity = self._tls_state.extract_peer_identity(url)

        if pool is None:
            if url.host is not None:
                pool = self._tls_state.pool_for(url.scheme, url.host, url.port)
            else:
                print("ConnectionPool not defined")
                exit(1)

        if url.path is not None:
            path = url.path
        else:
            path = "/"
        
        
        print("Client path: "+ path)
        res = pool.request('GET', path)

        if res.status == 200:
            #ctype = res.getheader("Content-Type")
            ctype = res.headers['content-type']
            print("Response:    " + str(res.data))
            print("Response content type: " + str(ctype))
            if ctype == "application/x-mplane+json":
                # Probably an envelope. Process the message.
                self.handle_message(
                    mplane.model.parse_json(res.data.decode("utf-8")), identity)
            elif ctype == "text/html":
                # Treat as a list of links to capability messages.
                parser = CrawlParser(strict=False)
                parser.feed(res.data.decode("utf-8"))
                parser.close()
                for capurl in parser.urls:
                    self.retrieve_capabilities(url=capurl,
                                               urlchain=urlchain + [url],
                                               pool=pool, identity=identity)
Example #23
 def _give_access(self, doc, opener):
     parser = FormParser()
     parser.feed(doc.decode(encoding='UTF-8'))
     parser.close()
     if not parser.form_parsed or parser.url is None:
         raise RuntimeError("Something wrong")
     if parser.method.upper() == "POST":
         request_data = urllib.parse.urlencode(parser.params).encode(
             "utf-8")
         response = opener.open(parser.url, request_data)
     else:
         raise NotImplementedError("Method '%s'" % parser.method)
     return response.geturl()
Example #24
 def _give_access(self, doc, opener):
     parser = FormParser()
     parser.feed(doc.decode(encoding='UTF-8'))
     parser.close()
     if not parser.form_parsed or parser.url is None:
         raise RuntimeError("Something wrong")
     if parser.method.upper() == "POST":
         request_data = urllib.parse.urlencode(
             parser.params).encode("utf-8")
         response = opener.open(parser.url, request_data)
     else:
         raise NotImplementedError("Method '%s'" % parser.method)
     return response.geturl()
Example #25
 def _run_check(self, source, expected_events, collector=None):
     if collector is None:
         collector = self.get_collector()
     parser = collector
     for s in source:
         parser.feed(s)
     parser.close()
     events = parser.get_events()
     if events != expected_events:
         self.fail("received events did not match expected events" +
                   "\nSource:\n" + repr(source) +
                   "\nExpected:\n" + pprint.pformat(expected_events) +
                   "\nReceived:\n" + pprint.pformat(events))
Example #26
 def give_access(doc, opener):
     parser = FormParser()
     parser.feed(doc.decode('utf-8'))
     parser.close()
     if not parser.form_parsed or parser.url is None:
         raise RuntimeError("Something wrong")
     if parser.method == "POST":
         response = opener.open(
             parser.url,
             urllib.parse.urlencode(parser.params).encode('utf-8'))
     else:
         raise NotImplementedError("Method '%s'" % parser.method)
     return response.geturl()
Example #27
def ExtractAllPages(html):
    """
    Extract the list of wiki pagenames from an AllPages html output.
    """
    if debug:
        print("extract all pages, htmlsize=%d" % len(html))
    parser = AllpagesFilter()
    try:
        parser.feed(html)
    finally:
        parser.close()

    return parser
Example #28
 def _run_check(self, source, expected_events, collector=None):
     if collector is None:
         collector = self.get_collector()
     parser = collector
     for s in source:
         parser.feed(s)
     parser.close()
     events = parser.get_events()
     if events != expected_events:
         self.fail("received events did not match expected events" +
                   "\nSource:\n" + repr(source) +
                   "\nExpected:\n" + pprint.pformat(expected_events) +
                   "\nReceived:\n" + pprint.pformat(events))
def html_to_text(html, maxcol=80):
    try:
        buffer = StringIO()
        formatter = Formatter.AbstractFormatter(TextWriter(buffer, maxcol))
        parser = HTMLParserAnchor(formatter)
        parser.feed(html)
        parser.close()
        text = buffer.getvalue()
        buffer.close()
        return text
    except Exception as e:
        syslog.syslog(syslog.LOG_ERR, 'cannot convert html to text: %s' % e)
        return None
Example #30
def html_to_text(html_text):
    """
    Convert HTML to plain text.

    :param html_text: A fragment of HTML (a string).
    :returns: The plain text (a string).

    This function uses the :class:`HTMLStripper` class that builds on top of
    the :class:`html.parser.HTMLParser` class in the Python standard library.
    """
    parser = HTMLStripper()
    parser.feed(html_text)
    parser.close()
    return parser.output.getvalue()
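
HTMLStripper itself is not part of the snippet; the docstring only says it builds on html.parser.HTMLParser, and the caller reads parser.output.getvalue(). A minimal sketch along those lines, purely illustrative:

import html.parser
from io import StringIO

class HTMLStripper(html.parser.HTMLParser):
    """Hypothetical stripper: drops markup and keeps only the text content."""

    def __init__(self):
        super().__init__(convert_charrefs=True)
        self.output = StringIO()

    def handle_data(self, data):
        self.output.write(data)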
Example #31
def ExtractBaseurl(html):
    """
    Finds wiki baseurl on html page
    """
    parser = BaseurlFilter()
    try:
        parser.feed(html)
    finally:
        parser.close()

    if len(parser.baseurl) > 1:
        print("baseurl: found multiple", parser.baseurl)
    if len(parser.baseurl) == 0:
        raise Exception("no baseurl found")

    return max(parser.baseurl.items(), key=lambda kv: kv[1])[0]
Example #32
    def retrieve_capabilities(self, url, urlchain=[], pool=None, identity=None):
        """
        Connect to the given URL, retrieve and process the
        capabilities/withdrawals found there
        """

        # detect loops in capability links
        if url in urlchain:
            return

        if not self._default_url:
            self.set_default_url(url)

        if isinstance(url, str):
            url = urllib3.util.parse_url(url)

        if identity is None:
            identity = self._tls_state.extract_peer_identity(url)

        if pool is None:
            if url.host is not None:
                pool = self._tls_state.pool_for(url.scheme, url.host, url.port)
            else:
                raise ValueError("HttpInitiatorClient capability retrieval missing connection pool")

        if url.path is not None:
            path = url.path
        else:
            path = "/"
        res = pool.request('GET', path)

        if res.status == 200:
            ctype = res.getheader("Content-Type")
            if ctype == "application/x-mplane+json":

                # Probably an envelope. Process the message.
                self.handle_message(
                    mplane.model.parse_json(res.data.decode("utf-8")), identity)
            elif ctype == "text/html":
                # Treat as a list of links to capability messages.
                parser = CrawlParser()
                parser.feed(res.data.decode("utf-8"))
                parser.close()
                for capurl in parser.urls:
                    self.retrieve_capabilities(url=capurl,
                                               urlchain=urlchain + [url],
                                               pool=pool, identity=identity)
Example #33
    def _get_eol_list(self) -> typing.List[str]:
        """Scrapes the FreeBSD website and returns a list of EOL RELEASES"""
        request = urllib.request.Request(self.eol_url,
                                         headers={"Accept-Charset": "utf-8"})
        with urllib.request.urlopen(request) as response:  # nosec: B310

            if response.getcode() != 200:  # noqa: T484
                iocage.lib.errors.DistributionEOLWarningDownloadFailed(
                    logger=self.logger, level="warning")
                return []

            parser = EOLParser()
            data = response.read().decode("utf-8", "ignore")
            parser.feed(data)
            parser.close()

            return parser.eol_releases
Example #34
 def auth_user(email, password, client_id, scope, opener):
     response = opener.open(
         "http://oauth.vk.com/oauth/authorize?" + \
         "redirect_uri=http://oauth.vk.com/blank.html&response_type=token&" + \
         "client_id=%s&scope=%s&display=wap" % (client_id, ",".join(scope))
         )
     doc = response.read()
     parser = FormParser()
     parser.feed(doc.decode('utf-8'))
     parser.close()
     if not parser.form_parsed or parser.url is None or "pass" not in parser.params or \
       "email" not in parser.params:
           raise RuntimeError("Something wrong")
     parser.params["email"] = email
     parser.params["pass"] = password
     if parser.method == "POST":
         response = opener.open(parser.url, urllib.parse.urlencode(parser.params).encode('utf-8'))
     else:
         raise NotImplementedError("Method '%s'" % parser.method)
     return response.read(), response.geturl()
Example #35
 def crawlSuperPingProxyList(self, target):
     self.currentHost = self.SuperPing
     self.client = http.client.HTTPConnection(self.host[self.currentHost], self.port, self.timeout)
     try:
         self.startRequest("GET", "/?ping={0}&locale=en".format(target))
         self.getRawData("")
         parser = WebHTMLParser()
         parser.feed(self.rawData)
         parser.close()
         mdata = parser.get_parsed_data()
         pingData = []
         while re.search(r"load\(\'\.([^\']+)", mdata):
             sm = re.search(r"load\(\'\.([^\']+)", mdata)
             mdata = mdata[sm.end(1):]
             pingData.append(sm.group(1))
         self.superPing(pingData)
     except TimeoutError as err:
         logger.log(logging.ERROR, err)
     except Exception as err:
         logger.log(logging.ERROR, err)
Example #36
def get_county(location):
    state_file = open(utils.get_project_dir() / "state_file.txt", "r")
    state_long_names_to_names = json.loads(state_file.read())
    state_file.close()

    url_base = "http://www.openstreetmap.org/geocoder/search_osm_nominatim_reverse"
    url = form_url(
        url_base, {
            "lat": location.latitude,
            "lon": location.longitude,
            "zoom": 17,
            "minlon": -122,
            "minlat": 47,
            "maxlon": -121,
            "maxlat": 48
        })

    print(url)

    response = get_web_resource(url)
    print(response)

    class StreamingHTMLParser(html.parser.HTMLParser):
        def __init__(self):
            html.parser.HTMLParser.__init__(self)
            self.data = None

        def handle_starttag(self, tag_name, attributes):
            attributes_d = dict(attributes)
            if tag_name == "a" and "data-name" in attributes_d:
                self.data = attributes_d["data-name"]

    parser = StreamingHTMLParser()
    parser.feed(response)
    parser.close()

    match = re.search(r"(\w* County), ([^,]*)", parser.data)
    county_name = match.group(1)
    state_long_name = match.group(2)
    state_name = state_long_names_to_names[state_long_name]
    return (county_name, state_name)
Example #37
def get_song_infos_from_deezer_website(search_type, id):
    # search_type: either one of the constants: TYPE_TRACK|TYPE_ALBUM|TYPE_PLAYLIST
    # id: deezer_id of the song/album/playlist (like https://www.deezer.com/de/track/823267272)
    # return: if TYPE_TRACK => song (dict grabbed from the website with information about a song)
    # return: if TYPE_ALBUM|TYPE_PLAYLIST => list of songs
    # raises
    # Deezer404Exception if
    # 1. open playlist https://www.deezer.com/de/playlist/1180748301 and click on song Honey from Moby in a new tab:
    # 2. Deezer gives you a 404: https://www.deezer.com/de/track/68925038
    # Deezer403Exception if we are not logged in

    url = "https://www.deezer.com/de/{}/{}".format(search_type, id)
    resp = session.get(url)
    if resp.status_code == 404:
        raise Deezer404Exception(
            "ERROR: Got a 404 for {} from Deezer".format(url))
    if "MD5_ORIGIN" not in resp.text:
        raise Deezer403Exception(
            "ERROR: we are not logged in on deezer.com. Please update the cookie"
        )

    parser = ScriptExtractor()
    parser.feed(resp.text)
    parser.close()

    songs = []
    for script in parser.scripts:
        regex = re.search(r'{"DATA":.*', script)
        if regex:
            DZR_APP_STATE = json.loads(regex.group())
            global album_Data
            album_Data = DZR_APP_STATE.get("DATA")
            if DZR_APP_STATE['DATA']['__TYPE__'] == 'playlist' or DZR_APP_STATE[
                    'DATA']['__TYPE__'] == 'album':
                # songs if you searched for album/playlist
                for song in DZR_APP_STATE['SONGS']['data']:
                    songs.append(song)
            elif DZR_APP_STATE['DATA']['__TYPE__'] == 'song':
                # just one song on that page
                songs.append(DZR_APP_STATE['DATA'])
    return songs[0] if search_type == TYPE_TRACK else songs
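
A hedged usage sketch based on the comments above (TYPE_TRACK, session and the exception classes come from the surrounding module; the track id is the one quoted in the comment, and the SNG_TITLE field name is an assumption about Deezer's payload):

# Illustrative only: fetch one track's metadata and handle the documented failure modes.
try:
    song = get_song_infos_from_deezer_website(TYPE_TRACK, 823267272)
    print(song.get("SNG_TITLE"))  # field name is an assumption, not confirmed by the snippet
except Deezer404Exception:
    print("track page not found")
except Deezer403Exception:
    print("not logged in - refresh the cookie")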
Example #38
 def crawlSuperPingProxyList(self, target):
     self.currentHost = self.SuperPing
     self.client = http.client.HTTPConnection(self.host[self.currentHost],
                                              self.port, self.timeout)
     try:
         self.startRequest("GET", "/?ping={0}&locale=en".format(target))
         self.getRawData("")
         parser = WebHTMLParser()
         parser.feed(self.rawData)
         parser.close()
         mdata = parser.get_parsed_data()
         pingData = []
         while re.search(r"load\(\'\.([^\']+)", mdata):
             sm = re.search(r"load\(\'\.([^\']+)", mdata)
             mdata = mdata[sm.end(1):]
             pingData.append(sm.group(1))
         self.superPing(pingData)
     except TimeoutError as err:
         logger.log(logging.ERROR, err)
     except Exception as err:
         logger.log(logging.ERROR, err)
Example #39
def fragment_fromstring(text, parser=None, create_parent=None):
    """
    Returns an HTML fragment from a string. The fragment must contain just a single element, unless create_parent is given; e.g., fragment_fromstring(string, create_parent='div') will wrap the element in a <div>.
    """
    if parser is None:
        parser = HTMLParser(TreeBuilder())
    if create_parent:
        parser.feed("<%s>" % create_parent)
    parser.feed(text)
    if create_parent:
        parser.feed("</%s>" % create_parent)
    return parser.close()
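
A short usage sketch of the wrapping behaviour the docstring describes, assuming close() returns an ElementTree-style element (as the TreeBuilder pairing suggests):

# Hypothetical call: two sibling paragraphs are only legal with a created parent.
root = fragment_fromstring("<p>one</p><p>two</p>", create_parent="div")
print(root.tag)                       # expected: 'div' (the created parent)
print([child.tag for child in root])  # expected: ['p', 'p']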
Example #40
def fragment_fromstring(text, parser=None, create_parent=None):
    """
    Returns an HTML fragment from a string. The fragment must contain just a single element, unless create_parent is given; e.g., fragment_fromstring(string, create_parent='div') will wrap the element in a <div>.
    """
    if parser is None:
        parser = HTMLParser(TreeBuilder())
    if create_parent:
        parser.feed("<%s>" % create_parent)
    parser.feed(text)
    if create_parent:
        parser.feed("</%s>" % create_parent)
    return parser.close()
  def __init__(self, url, numCols, extractionMap, exceptions):
    # Request the html.
    request = urllib.request.Request(url)
    request.add_header("User-Agent",self.user_agent)
    try:
      response = urllib.request.urlopen(request)
    except:
      print("Error: Invalid URL. Exiting.")
      exit()
    htmlContent = response.read().decode("utf8")

    # Some files have <br> in the middle of a <td> tag,
    # and cause the parser to misinterpret the data.
    htmlContent = htmlContent.replace("<br>", "")

    # Parse the html.
    parser = CountryParser(numCols, extractionMap, exceptions, strict=False)
    htmlContent = parser.unescape(htmlContent) # Unescape HTML entities.
    parser.feed(htmlContent)
    parser.close()
    self.__countryData = parser.countryData
Example #42
def discover(url, timeout=None):
    """Discover the hub url and topic url of a given url. Firstly, by inspecting
    the page's headers, secondarily by inspecting the content for link tags.

    timeout determines how long to wait for the url to load. It defaults to 3.

    """
    resp = get_content({'REQUEST_TIMEOUT': timeout}, url)

    parser = LinkParser()
    parser.hub_url = (resp.links.get('hub') or {}).get('url')
    parser.topic_url = (resp.links.get('self') or {}).get('url')
    try:
        parser.updated()
        for chunk in resp.iter_content(chunk_size=None, decode_unicode=True):
            parser.feed(chunk)
        parser.close()
    except Finished:
        return {'hub_url': parser.hub_url, 'topic_url': parser.topic_url}

    raise DiscoveryError("Could not find hub url in topic page")
Example #43
def _query_from_ip_cn(ip_str: str):
    command = """curl
    'https://ip.cn/?ip=%(ip)s'
  -H 'user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
  -H 'accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'
  -H 'sec-fetch-site: same-origin'
  -H 'sec-fetch-mode: navigate'
  -H 'sec-fetch-user: ?1'
  -H 'sec-fetch-dest: document'
  -H 'referer: https://ip.cn/?ip=%(ip)s'
  -H 'accept-language: zh-CN,zh;q=0.9,en;q=0.8'
  --compressed
  -s
  """

    result = dict()

    try:
        ips = ip_str
        str_command = command % dict(ip=ips)
        str_command = " ".join(
            filter(lambda x: x, map(str.strip, str_command.split("\n"))))
        with os.popen(str_command) as inf:
            ctx = inf.read()

        parser = _parser_cls()
        parser.feed(ctx)
        parser.close()
        ret_list = parser.get_ret_list()
        # result['ip'] = ret_list[0]
        result['desc_zh'] = ret_list[1]
        cpc = str.split(ret_list[2], ',') if len(ret_list) > 2 else []
        result['city'] = str.strip(cpc[-3] if len(cpc) >= 3 else '')
        result['province'] = str.strip(cpc[-2] if len(cpc) >= 2 else '')
        result['country'] = str.strip(cpc[-1] if len(cpc) >= 1 else '')
    except Exception as err:
        result['error'] = str(err)
        pass

    return result
def get_windows_table():
    global _windows_table
    # If we already loaded _windows_table, no need to load it all over again.
    if _windows_table:
        return _windows_table

    # windows-rcs.html was fetched on 2015-03-24 with the following command:
    # curl -o windows-rcs.html \
    #         https://msdn.microsoft.com/en-us/library/cc704588.aspx
    parser = TableParser()
    with open(os.path.join(os.path.dirname(__file__),
                           "windows-rcs.html")) as hf:
        # We tried feeding the file data to TableParser in chunks, to avoid
        # buffering the entire file as a single string. Unfortunately its
        # handle_data() cannot tell the difference between distinct calls
        # separated by HTML tags, and distinct calls necessitated by a chunk
        # boundary. Sigh! Read in the whole file. At the time this was
        # written, it was only 500KB anyway.
        parser.feed(hf.read())
    parser.close()
    table = parser.table

    # With our parser, any <tr><th>...</th></tr> row leaves a table entry
    # consisting only of an empty list. Remove any such.
    while table and not table[0]:
        table.pop(0)

    # We expect rows of the form:
    # [['0x00000000', 'STATUS_SUCCESS'],
    #  ['The operation completed successfully.']]
    # The latter list will have multiple entries if Microsoft embedded <br/>
    # or <p> ... </p> in the text, in which case joining with '\n' is
    # appropriate.
    # Turn that into a dict whose key is the hex string, and whose value is
    # the pair (symbol, desc).
    _windows_table = dict(
        (key, (symbol, '\n'.join(desc))) for (key, symbol), desc in table)

    return _windows_table
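
Given the row shape described in the comments, lookups against the cached table are plain dict access; a sketch using the sample row quoted above:

# Illustrative lookup against the sample row shown in the comments.
table = get_windows_table()
symbol, desc = table["0x00000000"]
print(symbol)  # STATUS_SUCCESS
print(desc)    # The operation completed successfully.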
Example #45
 def auth_user(email, password, client_id, scope, opener):
     response = opener.open(
         "http://oauth.vk.com/oauth/authorize?" + \
         "redirect_uri=http://oauth.vk.com/blank.html&response_type=token&" + \
         "client_id=%s&scope=%s&display=wap" % (client_id, ",".join(scope))
         )
     doc = response.read()
     parser = FormParser()
     parser.feed(doc.decode('utf-8'))
     parser.close()
     if not parser.form_parsed or parser.url is None or "pass" not in parser.params or \
       "email" not in parser.params:
         raise RuntimeError("Something wrong")
     parser.params["email"] = email
     parser.params["pass"] = password
     if parser.method == "POST":
         response = opener.open(
             parser.url,
             urllib.parse.urlencode(parser.params).encode('utf-8'))
     else:
         raise NotImplementedError("Method '%s'" % parser.method)
     return response.read(), response.geturl()
Example #46
def get_tests(self, task, i, j):
    proto = fetch_protocol(self, task, i, j).decode('utf-8', 'replace')
    parser = TestsParser()
    parser.feed(proto.split('\n', 2)[-1])
    parser.close()
    tests = []
    for i in parser.test_meta:
        meta = {}
        files = {}
        for k, v in i.items():
            if k == 'checker-comment': k = 'checker'
            if k in ('input', 'output', 'stderr', 'correct', 'checker'):
                files[k] = v
            else:
                meta[k] = v
        tests.append((meta, files))
    compiler_output = parser.global_meta.get('compiler_output', '')
    try:
        del parser.global_meta['compiler_output']
    except KeyError:
        pass
    return (compiler_output, parser.global_meta, tests)
Example #47
def get_place_data(place_name):
    url_base = "http://openstreetmap.org/geocoder/search_osm_nominatim"
    url = form_url(url_base, {"query": place_name})

    print(url)

    response = get_web_resource(url)
    print(response)

    class StreamingHTMLParser(html.parser.HTMLParser):
        def __init__(self):
            html.parser.HTMLParser.__init__(self)
            self.data = None
            self.data_found = False

        def handle_starttag(self, tag_name, attributes):
            if self.data_found:
                return
            attributes_d = dict(attributes)
            if tag_name == "li":
                if self.data is None:
                    self.data = dict()
            if tag_name == "a" and "data-lat" in attributes_d:
                self.data["location"] = Location(attributes_d["data-lon"],
                                                 attributes_d["data-lat"])
                self.data_found = True

        def handle_data(self, chars):
            if self.data_found:
                return
            if self.data is not None:
                if "location" not in self.data:
                    self.data["usage"] = chars[:-1]

    parser = StreamingHTMLParser()
    parser.feed(response)
    parser.close()

    return parser.data
Example #48
    def _auth_user(self):
        url = AUTH_URL % (self.app_id, self.scope)
        response = self.opener.open(url)
        doc = response.read().decode(encoding='UTF-8')
        parser = FormParser()
        parser.feed(doc)
        parser.close()

        if not parser.form_parsed \
                or parser.url is None \
                or "pass" not in parser.params \
                or "email" not in parser.params:
            raise RuntimeError("Something wrong")

        parser.params["email"] = self.login
        parser.params["pass"] = self.password
        if parser.method.upper() == "POST":
            request_data = urllib.parse.urlencode(parser.params).encode(
                "utf-8")
            response = self.opener.open(parser.url, request_data)
        else:
            raise NotImplementedError("Method '%s'" % parser.method)
        return response.read(), response.geturl()
Example #49
    def _auth_user(self):
        url = AUTH_URL % (self.app_id, self.scope)
        response = self.opener.open(url)
        doc = response.read().decode(encoding='UTF-8')
        parser = FormParser()
        parser.feed(doc)
        parser.close()

        if not parser.form_parsed \
                or parser.url is None \
                or "pass" not in parser.params \
                or "email" not in parser.params:
            raise RuntimeError("Something wrong")

        parser.params["email"] = self.login
        parser.params["pass"] = self.password
        if parser.method.upper() == "POST":
            request_data = urllib.parse.urlencode(
                parser.params).encode("utf-8")
            response = self.opener.open(parser.url, request_data)
        else:
            raise NotImplementedError("Method '%s'" % parser.method)
        return response.read(), response.geturl()
Example #50
    def getentry(self):
        # Start with the entry from the parent.
        entry = FileHandler.getentry(self)
        parser = HTMLTitleParser()

        with self.vfs.open(self.getselector(), "rb") as fp:
            while not parser.gotcompletetitle:
                line = fp.readline()
                if not line:
                    break
                # The PY3 HTML parser doesn't handle surrogateescape
                parser.feed(line.decode(errors="replace"))
            parser.close()

        # OK, we've parsed the file and exited because of either an EOF
        # or a complete title (or error).  Now, figure out what happened.

        if parser.gotcompletetitle:
            # Convert all whitespace sequences to a single space.
            # Removes newlines, tabs, etc.  Good for presentation
            # and for security.
            title = re.sub(r"[\s]+", " ", parser.titlestr)
            entry.setname(title)
        return entry
 def parse(source=source):
     parser = html.parser.HTMLParser()
     parser.feed(source)
     parser.close()
Example #52
def auth(login, passwd, appid, scope):
    if not isinstance(scope, list):
        scope = [scope]

    _opener = urllib.request.build_opener(
        urllib.request.HTTPCookieProcessor(http.cookiejar.CookieJar()),
        urllib.request.HTTPRedirectHandler())
    try:
        response = _opener.open(
            'http://oauth.vk.com/oauth/authorize?' + \
            'redirect_uri=oauth.vk.com/blank.html&response_type=token&' + \
            'client_id={0}&scope={1}&display=wap'.format(appid, ','.join(scope))
        )
    except urllib.error.URLError as e:
        raise VKAuthError('Cant connect to vk.com or app_id is invalid.')
    except Exception as e:
        raise VKAuthError('Unhandled exception: ' + str(e))

    doc = response.read().decode()
    parser = _FormParser()
    parser.feed(doc)
    parser.close()

    if not parser.form_parsed or parser.url is None or 'pass' not in parser.params or 'email' not in parser.params:
        raise VKAuthError('Unexpected response page o_O')

    parser.params['email'] = login
    parser.params['pass'] = passwd
    parser.method = 'POST'
    keys = [i for i in parser.params]
    for i in keys:
        b = '1'.encode()
        if type(i) != type(b):
            a = i.encode()
        else:
            a = i
        if type(parser.params[i]) != type(b):
            parser.params[a] = parser.params[i].encode()
        else:
            parser.params[a] = parser.params[i]
        parser.params.pop(i)

    response = _opener.open(parser.url, urllib.parse.urlencode(parser.params).encode())

    doc = response.read()
    url = response.geturl()

    if urllib.parse.urlparse(url).path != '/blank.html':
        parser = _FormParser()
        parser.feed(str(doc))
        parser.close()
        if not parser.form_parsed or parser.url is None:
            raise VKAuthError('Invalid email or password')
        if parser.method == 'post':
            response = _opener.open(parser.url, urllib.parse.urlencode(parser.params).encode())
        else:
            raise VKAuthError('Unexpected method: ' + parser.method)
        url = response.geturl()

    if urllib.parse.urlparse(url).path != "/blank.html":
        raise VKAuthError('Invalid email or password')

    answer = dict(tuple(kv_pair.split('=')) for kv_pair in urllib.parse.urlparse(url).fragment.split('&'))
    if 'access_token' not in answer or 'user_id' not in answer:
        raise VKAuthError('Missing some values in answer')

    return answer['access_token'], answer['user_id'], answer['expires_in']
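
A hedged usage sketch (APP_ID and the credentials are placeholders; the scope names follow the VK permission strings used elsewhere in these examples):

# Illustrative only: authorize and keep the OAuth token for later API calls.
try:
    access_token, user_id, expires_in = auth("user@example.com", "secret", APP_ID, ["friends", "photos"])
    print("token for user", user_id, "valid for", expires_in, "seconds")
except VKAuthError as err:
    print("VK auth failed:", err)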
Example #53
 def parse(source=source):
     parser = self.get_collector()
     parser.feed(source)
     parser.close()
Example #54
                "Referer": "http://mobile.9om.com/",
                "Connection": "keep-alive",
                "Upgrade-Insecure-Requests": "1",
                "Pragma": "no-cache",
                "Cache-Control": "no-cache",
            },
            method="GET",
        )

        try:
            with urllib.request.urlopen(req, timeout=33) as f:
                print("status", f.status, f.reason)
                res = f.read().decode("utf-8")
                parser = MyHTMLPraser()
                parser.feed(res)
                parser.close()
        except urllib.error.HTTPError as e:
            print("Server could not fulfill the request,erro code", e.code, e.info())
        except urllib.error.URLError as e:
            print("We failed to reach the server:", e.reason)
        except socket.timeout:
            print("Time out")
            # saveMyData() -- if a timeout forces you to kill the program, the crawl results are never written and the output file ends up empty
            count = 0

        count += 1

        if count % 9 == 0:
            saveMyData()
            count = 0
Example #55
def parser_test(html_str):
    #parser html source file
    parser = MyHTMLParser(strict = False)
    parser.feed(html_str)
    parser.close()
Example #56
 def parse(source=source):
     parser = self.get_collector()
     parser.feed(source)
     parser.close()
Example #57
def discover(url):
    '''Perform service discovery on the OP URL.
    Return list of service types, and the auth/2.0 URL,
    or None if discovery fails.'''
    scheme, netloc, path, query, fragment = urllib.parse.urlsplit(url)
    assert not fragment
    if scheme == 'https':
        conn = http.client.HTTPSConnection(netloc)
    elif scheme == 'http':
        conn = http.client.HTTPConnection(netloc)
    else:
        raise ValueError("Unsupported scheme " + scheme)
    # conn.set_debuglevel(1)
    if query:
        path += '?'+query
    try:
        conn.connect()
    except:
        # DNS or TCP error
        return None
    # httplib in 2.5 incorrectly sends https port in Host
    # header even if it is 443
    conn.putrequest("GET", path, skip_host=1)
    conn.putheader('Host', netloc)
    conn.putheader('Accept', "text/html; q=0.3, "+
                   "application/xhtml+xml; q=0.5, "+
                   "application/xrds+xml")
    conn.endheaders()

    res = conn.getresponse()
    data = res.read()
    conn.close()

    if res.status in (301, 302, 303, 307):
        return discover(res.msg.get('location'))

    if sys.version_info < (3,0):
        content_type = res.msg.gettype()
    else:
        content_type = res.msg.get_content_type()

    # Yadis 6.2.5 option 2 and 3: header includes x-xrds-location
    xrds_loc = res.msg.get('x-xrds-location')
    if xrds_loc and content_type != 'application/xrds+xml':
        return discover(xrds_loc)

    if content_type in ('text/html', 'application/xhtml+xml'):
        parser = OpenIDParser()
        parser.feed(data.decode('latin-1'))
        parser.close()
        # Yadis 6.2.5 option 1: meta tag
        if parser.xrds_location:
            return discover(parser.xrds_location)
        # OpenID 7.3.3: attempt html based discovery
        op_endpoint = parser.links.get('openid2.provider')
        if op_endpoint:
            op_local = parser.links.get('openid2.local_id')
            return ['http://specs.openid.net/auth/2.0/signon'], op_endpoint, op_local
        # 14.2.1: 1.1 compatibility
        op_endpoint = parser.links.get('openid.server')
        if op_endpoint:
            op_local = parser.links.get('openid.delegate')
            return ['http://openid.net/signon/1.1'], op_endpoint, op_local
        # Discovery failed
        return None

    elif content_type == 'application/xrds+xml':
        # Yadis 6.2.5 option 4
        doc = ElementTree.fromstring(data)
        return _extract_services(doc)
    else:
        # unknown content type
        return None
    return services, op_endpoint, op_local
Example #58
    def __init__(self, email, password, client_id, permissions):
        """
        VKAuth(email, password, application id, scope)
        Where scope is a list of permissions like ['friends', 'photos']
        If the object is successfully initialised, the access token will be
        available as VKAuth.access_token and the user id as VKAuth.user_id.
        Raises VKAuthError on errors.
        """
        if not isinstance(permissions, list):
            permissions = [permissions]

        for element in permissions:
            if element not in self.valid_scope:
                raise VKAuthError(1, 'invalid scope element: '+element)

        self.__opener = urllib.request.build_opener(
            urllib.request.HTTPCookieProcessor(http.cookiejar.CookieJar()),
            urllib.request.HTTPRedirectHandler())

        try:
            response = self.__opener.open(
                "http://oauth.vk.com/oauth/authorize?" + \
                "redirect_uri=oauth.vk.com/blank.html&response_type=token&" + \
                "client_id={0}&scope={1}&display=wap".format(client_id, ",".join(permissions))
                )
        except urllib.error.URLError as E:
            raise VKAuthError(2, 'internet connection failed')
        except Exception as E:
            raise VKAuthError(0, 'Unhandled exception: '+str(E))

        doc = response.read().decode()
        parser = FormParser()
        parser.feed(doc)
        parser.close()
        if not parser.form_parsed or parser.url is None or "pass" not in parser.params or \
          "email" not in parser.params:
              raise VKAuthError(3, 'wrong response page oO')
        parser.params["email"] = email
        parser.params["pass"] = password
        parser.method = 'POST'
        keys = [ i for i in parser.params]
        for i in keys:
            b = '1'.encode()
            if type(i)!=type(b):
                a = i.encode()
            else: a = i
            if type(parser.params[i])!=type(b):
                parser.params[a] = parser.params[i].encode()
            else:
                parser.params[a] = parser.params[i]
            parser.params.pop(i)

        try:
            response = self.__opener.open(parser.url, urllib.parse.urlencode(parser.params).encode())
        except urllib.error.URLError as E:
            raise VKAuthError(2, 'internet connection failed')
        except Exception as E:
            raise VKAuthError(0, 'Unhandled exception: '+str(E))

        doc = response.read()
        url = response.geturl()

        if urllib.parse.urlparse(url).path != "/blank.html":
            url = self.__give_access(doc)
        
        if urllib.parse.urlparse(url).path != "/blank.html":
            raise VKAuthError(4, "Invalid email or password")

        def split_key_value(kv_pair):
            kv = kv_pair.split("=")
            return kv[0], kv[1]

        answer = dict(split_key_value(kv_pair) for kv_pair in urllib.parse.urlparse(url).fragment.split("&"))
        if "access_token" not in answer or "user_id" not in answer:
            raise VKAuthError(5, "Missing some values in answer")
        self.access_token = answer["access_token"]
        self.user_id = answer["user_id"]