def contains_one_link_to_diff(self, html_or_soup, package_diff):
    """Return whether the html contains a link to the diff content."""
    if not isinstance(html_or_soup, BeautifulSoup):
        soup = BeautifulSoup(html_or_soup)
    else:
        soup = html_or_soup
    return 1 == len(soup.findAll("a", href=package_diff.diff_content.http_url))
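The helper accepts either raw HTML or an already-parsed soup. A usage sketch, as it might run inside a test method of the same class; the two fake classes and the URL are hypothetical stand-ins for the fixture's package_diff:

    class FakeDiffContent:
        http_url = "http://launchpad.example/diff.gz"  # hypothetical URL

    class FakePackageDiff:
        diff_content = FakeDiffContent()

    html = '<p><a href="http://launchpad.example/diff.gz">diff</a></p>'
    # Both call forms are accepted: raw HTML or an already-built soup.
    assert self.contains_one_link_to_diff(html, FakePackageDiff())
    assert self.contains_one_link_to_diff(BeautifulSoup(html), FakePackageDiff())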
Example #2
    def test_date_form(self):
        """
        Form with a date
        """
        schema = schemaish.Structure()
        schema.add("a", schemaish.Date())

        form_name = "form_name"
        form = formish.Form(schema, form_name)

        request_data = {"a": "1966-12-18"}
        expected_data = {"a": date(1966, 12, 18)}

        request = self.Request(form_name, request_data)

        data = form.validate(request)
        assert data == expected_data

        form.defaults = expected_data
        htmlsoup = BeautifulSoup(form())
        assert htmlsoup.findAll(id="form_name-a-field")[0]["class"] == "field date input"
        assert htmlsoup.findAll(id="form_name-a")[0]["value"] == "1966-12-18"
Example #3
    def test_string_form(self):
        """
        Form with a couple of string fields
        """
        schema = schemaish.Structure()
        schema.add("fieldOne", schemaish.String())
        schema.add("fieldTwo", schemaish.String())

        form_name = "form_name"
        form = formish.Form(schema, form_name)

        request_data = {"fieldOne": "a", "fieldTwo": "b"}
        expected_data = {"fieldOne": "a", "fieldTwo": "b"}

        request = self.Request(form_name, request_data)

        data = form.validate(request)
        assert data == expected_data

        form.defaults = request_data
        htmlsoup = BeautifulSoup(form())
        assert htmlsoup.findAll(id="form_name-fieldOne-field")[0]["class"] == "field string input"
        assert htmlsoup.findAll(id="form_name-fieldTwo-field")[0]["class"] == "field string input"
        assert htmlsoup.findAll(id="form_name-fieldOne")[0]["value"] == "a"
        assert htmlsoup.findAll(id="form_name-fieldTwo")[0]["value"] == "b"
Example #4
    def top_hundred(cls, num=DEFAULT_BILLBOARD_TOP_SONG_NUM):
        key = "billboard"
        if cache.get(key):
            return cache.get(key)[:num]

        url = "http://www1.billboard.com/rss/charts/hot-100"

        top_list = []
        try:
            data = urllib2.urlopen(url).read()
            soup = BeautifulSoup(data)
            h_list = soup.findAll("item")
            top_list = [cls.parse(item) for item in h_list]
            nodate_item = []
            withdate_item = []
            for item in top_list:
                if item["pub_date"]:
                    withdate_item.append(item)
                else:
                    nodate_item.append(item)

            if len(withdate_item) >= 1:
                for item in nodate_item:
                    item["pub_date"] = withdate_item[0]["pub_date"]
            else:
                raise Exception("Warning, no date info in Billboard")

            if any(item["pub_date"] is None for item in top_list):
                raise Exception("Warning")

            cache.set(key, top_list, 3600)
            return top_list[:num]
        except urllib2.HTTPError, e:
            print "HTTP error: %d" % e.code
            return []
Example #5
    def parse_page(self, url_page, results, race_info=None):
        """
        Scan one page fill the results
        :param url_page:
        :param results:
        :return:
        """
        try:
            f = urllib2.urlopen(url_page)
            content = f.read()
            f.close()
        except:
            print >> sys.stderr, "Error Url not found", url_page
            return False

        parsed_html = BeautifulSoup(content)
        # Check the page

        # Name of the race
        strinfo = parsed_html.table.td.text

        fin = True
        current, last = map(int, re.search("(\d+) de (\d+)", strinfo).groups())
        print >> sys.stderr, "# Name: %s (%d of %d)" % (strinfo, current, last)

        if current == last:
            fin = False
        t_results = parsed_html.findAll("table")[1]

        info_data = [
            ["pgeneral", int],
            ["pcat", int],
            ["cat", lambda x: self.in_a(x, self.my_str)],
            ["dorsal", int],
            None,
            ["tofficial", self.to_seconds],
            ["treal", self.to_seconds],
            ["club", lambda x: self.in_a(x, self.my_str)],
            None,
        ]

        for row in t_results.findAll("tr"):

            if not row.td:
                continue

            tds = row.findAll("td")
            d = {}

            for i, elem in enumerate(info_data):
                if elem:
                    value = tds[i].string
                    if not value:
                        value = tds[i]
                    d[elem[0]] = elem[1](value)
            dorsal = int(d["dorsal"])

            results[dorsal] = d

        return fin
Example #6
def getPrePage(url):
    patt = re.compile(r"bbstdoc,board,PPPerson,page.*")
    htmlContent = htmlGraber.doGrab(url)
    soup = BeautifulSoup(htmlContent)
    content = soup.find("a", href=re.compile(patt))
    #    print bbsHome+str(content.attrs[0][1]);
    return bbsHome + str(content.attrs[0][1])
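getPrePage (like getHomePage further down) reads the link target as content.attrs[0][1], i.e. the value of whichever attribute happens to come first in BeautifulSoup 3's attrs tuples. A sketch of the same lookup by attribute name, under a hypothetical function name so it does not shadow the original:

def getPrePageByName(url):
    # Sketch only: subscripting the tag by name does not depend on attribute order.
    patt = re.compile(r"bbstdoc,board,PPPerson,page.*")
    soup = BeautifulSoup(htmlGraber.doGrab(url))
    content = soup.find("a", href=patt)
    return bbsHome + content["href"]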
Example #7
def pres_primary_dem_precinct():
    with open("20080126__sc__democratic__primary__president__precinct.csv", "wb") as csvfile:
        w = unicodecsv.writer(csvfile, encoding="utf-8")
        for county in COUNTIES:
            url = (
                "http://www.state.sc.us/cgi-bin/scsec/r208dpf?race=PRESIDENT&election=pri08dpf&county=%s&pr=dp" % county
            )
            r = requests.get(url)
            soup = BeautifulSoup(r.text)
            table = soup.find("table")
            rows = table.findAll("tr")[1:]
            if county == "ABBEVILLE":
                first_names = [x.text for x in rows[0].findAll("td") if x.text != ""]
                last_names = [x.text for x in rows[1].findAll("td") if x.text != ""][2:]
                candidates = [" ".join(x) for x in zip(first_names, last_names)]
                headers = ["county", "precinct", "office", "district", "party", "candidate", "votes"]
                w.writerow(headers)
            for row in rows[2:]:
                for candidate in candidates:
                    w.writerow(
                        [
                            row.findAll("td")[0].text,
                            row.findAll("td")[1].text,
                            "President",
                            None,
                            "DEM",
                            candidate,
                            row.findAll("td")[candidates.index(candidate) + 2].text,
                        ]
                    )
Example #8
def ParsePhotoPage(self, url):
    resp = urllib.urlopen(url)
    strain = SoupStrainer("div", {"id": "photoViewer"})
    soup = BeautifulSoup(resp.read(), strain, fromEncoding="utf-8")
    for item in soup.find("table", {"id": "tPicTop"}).findAll("td"):
        img = item.find("img")["src"]
        self.meta.m_backdrop_list.append((img, re.sub("S\d{3}x\d{3}", "image", img)))
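The SoupStrainer here limits parsing to the photoViewer div, which keeps the tree small. A minimal standalone sketch of the same pattern, assuming BeautifulSoup 3 (where the strainer is passed as parseOnlyThese) and a made-up HTML string:

from BeautifulSoup import BeautifulSoup, SoupStrainer

markup = '<div id="nav">skipped</div><div id="photoViewer"><img src="a.jpg" /></div>'
strainer = SoupStrainer("div", {"id": "photoViewer"})
soup = BeautifulSoup(markup, parseOnlyThese=strainer)  # only the photoViewer div is parsed
print soup.find("img")["src"]  # -> a.jpg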
Example #9
    def save(self, manually_splitting=False, source_sentences=()):
        if not manually_splitting:
            # Tokenize the HTML that is fetched from a wiki article
            sentences = list()
            segment_id = 0
            soup = BeautifulSoup(self.source_text)
            sentence_splitter = determine_splitter(self.language)
            # initial save for foreign key based saves to work
            # save should occur after sent_detector is loaded
            super(SourceArticle, self).save()
            # find all paragraphs
            for p in soup.findAll("p"):
                only_p = p.findAll(text=True)
                p_text = "".join(only_p)
                # split all sentences in the paragraph

                sentences = sentence_splitter(p_text.strip())
                # TODO: remove bad sentences that were missed above
                sentences = [s for s in sentences if not re.match("^\**\[\d+\]\**$", s)]

                for sentence in sentences:
                    # Clean up non-breaking spaces (&nbsp;)
                    sentence = sentence.replace(u"\xa0", " ")

                    s = SourceSentence(article=self, text=sentence, segment_id=segment_id)
                    segment_id += 1
                    s.save()
                s.end_of_paragraph = True
                s.save()
            self.sentences_processed = True
        else:
            for sentence in source_sentences:
                sentence.save()
        super(SourceArticle, self).save()
Example #10
    def retrieve_product_data(self, product_link):
        browser = mechanize.Browser()

        try:
            product_data = browser.open(product_link).get_data()
        except URLError:
            return None

        product_soup = BeautifulSoup(product_data)

        pn = product_soup.find("div", {"id": "destacadoRuta"})

        if not pn:
            return None

        pn = pn.find("a").string

        try:
            pn = pn.replace(u"\xa0", " ").replace("\r", " ").replace("\n", " ")
        except AttributeError:
            return None

        pn = " ".join(re.split("\s+", pn.replace("\t", " ")))
        product_name = pn.encode("ascii", "ignore")

        product_price = int(product_soup.find("div", {"class": "precio1"}).contents[2].replace(".", ""))

        product_data = ProductData()
        product_data.custom_name = product_name
        product_data.price = product_price
        product_data.url = product_link
        product_data.comparison_field = product_link

        return product_data
Example #11
class XLBParser:
    def __init__(self, html):
        self._html = BeautifulSoup(html)

    def _get_xlbs_tag(self, tag, css_class):
        return self._html.findAll(tag, css_class)

    def _get_xlb_tag(self, tag, css_class):
        return self._html.find(tag, css_class)

    def get_xlb_info(self):
        divs = self._get_xlbs_tag("div", "textbox-content")
        for div in divs:
            image_url = "/%s" % div.first().img.get("src")
            info = div.text[div.text.index(">") + 1 :]
            one_url = div.find("a", attrs={"title": u"点击阅读全文"}).get("href")
            title = self._html.find("a", attrs={"href": one_url}).text
            yield info, title, "/%s" % one_url, image_url

    def get_xlb_page_info(self):
        # return str(url) , int(max_page_num)
        pages = self._get_xlb_tag("div", "pages")
        a = pages.findAll("a")
        urls = set(x.get("href") for x in a)

        urls_mode1 = [u for u in urls if "mode=2" not in u]
        querys = [urlparse.urlparse(u).query for u in urls_mode1]

        nums = dict(y.split("=") for x in querys if "&" in x for y in x.split("&"))

        return "/index.php?mode=1&page=%s", max(nums.values())
Example #12
    def check_indeed(self, title, city):
        br = mechanize.Browser(factory=mechanize.RobustFactory())
        br.set_handle_robots(False)

        indeed_url = "http://www.indeed.com"

        br.open(indeed_url)

        br.form = list(br.forms())[0]

        br["q"] = title  # The What id
        br["l"] = city  # The Where id
        response = br.submit()
        print br.geturl()
        response = br.open(br.geturl() + "&limit=20")  # 20 items per page, this is good to keep only relevant items
        print br.geturl()
        response = response.read()

        soup = BeautifulSoup(response)

        titles_soup = soup.findAll("a", attrs={"data-tn-element": "jobTitle"})
        titles = [item.text for item in titles_soup]
        urls = ["http://www.indeed.com" + item.get("href") for item in titles_soup]
        companies = self._find_field_in_soup(soup, "company")
        locations = self._find_field_in_soup(soup, "location")
        summaries = self._find_field_in_soup(soup, "summary")
        dates = self._find_field_in_soup(soup, "date")

        return self._create_jobs_dict(
            title=titles, company=companies, location=locations, summary=summaries, date_posted=dates, job_url=urls
        )
Example #13
def get_daily_specials(day=None):
    page = urlopen(URL)
    soup = BeautifulSoup(page)
    page.close()

    daily_specials = {
        "name": "Laco di Como",
        "specials": [],
        "streetaddress": "Timmervägen 6, Sundsvall",
        "dataurl": URL,
        "mapurl": "http://www.hitta.se/ViewDetailsPink.aspx?Vkiid=VgwibzXcvb%252fAf1XfiCvetg%253d%253d",
    }

    if day is None:
        day = date.today().weekday()

    # Only Monday - Friday
    if day > 4:
        return daily_specials
        return daily_specials

    day = [(u"Måndag", 2), (u"Tisdag", 2), (u"Onsdag", 2), (u"Torsdag", 2), (u"Fredag", 3)][day]
    ref = soup.find("h2", text=day[0]).parent
    daily_specials["specials"] = [li.text.strip() for li in ref.findNextSibling("ul") if isinstance(li, Tag)]

    return daily_specials
Example #14
    def getLevelNotes(self, levelUrl):
        soup = BeautifulSoup(self.downloadWithRetry(levelUrl, 3))

        # this looked a lot nicer when I thought I could use BS4 (w/ css selectors)
        # unfortunately Anki is still packaging BS3 so it's a little rougher
        # find the words in column a, whether they be text, image or audio
        colAParents = map(lambda x: x.find("div"), soup.findAll("div", "col_a"))
        colA = map(lambda x: (x.string, self.TEXT_NOTE), filter(lambda p: p["class"] == "text", colAParents))
        colA.extend(
            map(lambda x: (x.find("img")["src"], self.IMAGE_NOTE), filter(lambda p: p["class"] == "image", colAParents))
        )
        colA.extend(
            map(lambda x: (x.find("a")["href"], self.AUDIO_NOTE), filter(lambda p: p["class"] == "audio", colAParents))
        )

        # same deal for column b
        colBParents = map(lambda x: x.find("div"), soup.findAll("div", "col_b"))
        colB = map(lambda x: (x.string, self.TEXT_NOTE), filter(lambda p: p["class"] == "text", colBParents))
        colB.extend(
            map(lambda x: (x.find("img")["src"], self.IMAGE_NOTE), filter(lambda p: p["class"] == "image", colBParents))
        )
        colB.extend(
            map(lambda x: (x.find("a")["href"], self.AUDIO_NOTE), filter(lambda p: p["class"] == "audio", colBParents))
        )

        # pair the "fronts" and "backs" of the notes up
        # this is actually the reverse of what you might expect
        # the content in column A on memrise is typically what you're
        # expected to *produce*, so it goes on the back of the note
        return map(lambda x: self.Note(x[1], x[0]), zip(colA, colB))
Example #15
def _ft(tid):
    url = "http://m.yahoo.com/w/sports/ncaaf/team/ncaaf.t.%s" % (str(tid))
    request = urllib2.Request(
        url, headers={"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:17.0) Gecko/20100101 Firefox/17.0"}
    )
    html = urllib2.urlopen(request)
    html = html.read()
    soup = BeautifulSoup(html)
    tn = soup.find("div", attrs={"class": "uic title first"})
    if tn:
        team = tn.getText()
    else:
        team = None

    conf = soup.find("div", attrs={"class": "uic last"})
    if conf:
        cn = conf.getText()
        confnum = conf.find("a")["href"].split("?")[0]
        confnum = "".join(i for i in confnum if i.isdigit())
    else:
        conf = None

    if team and conf:
        return team, cn, confnum
    else:
        return None
Example #16
def replaceURL(strURL):
    # Provide user feedback
    print bcolors.OKGREEN + "[+] " + bcolors.ENDC + "Replacing URLs..."
    print bcolors.OKGREEN + "[+] " + bcolors.ENDC + "URLs that will be replaced:"
    # Open source, read lines, and begin parsing to replace all URLs for scripts and links
    try:
        # Print href URLs that will be replaced
        print "\n".join(re.findall("<a href=\"?'?([^\"'>]*)", open("source.html").read()))
        with open("source.html", "r") as html:
            # Read in the source html and parse with BeautifulSoup
            soup = BeautifulSoup(html)
            # Find all links and replace URLs with our new text/URLs
            for link in soup.findAll("a", href=True):
                link["href"] = "{{links.phishgate}}"
            for link in soup.findAll("link", href=True):
                link["href"] = urlparse.urljoin(strURL, link["href"])
            for link in soup.findAll("script", src=True):
                link["src"] = urlparse.urljoin(strURL, link["src"])
            source = str(soup.prettify(encoding="utf-8"))
            # Write the updated URLs to source.html while removing the [' and ']
            output = open("index.html", "w")
            output.write(source.replace("[", "").replace("]", ""))
            output.close()
            print bcolors.OKGREEN + "[+] " + bcolors.ENDC + "URL parsing successful. URLs replaced."
    except:
        print bcolors.FAIL + "[-] URL parsing failed. Make sure the html file exists and is readable." + bcolors.ENDC
Example #17
def getHomePage(homeurl):
    patt = re.compile(r"bbstcon,board,PPPerson,reid.*")
    htmlContent = htmlGraber.doGrab(homeurl)
    soup = BeautifulSoup(htmlContent)
    urlPages = soup.findAll("a", href=patt)
    #   print "\n".join([bbsHome+str(item.attrs[0][1]) for item in urlPages]);
    return [bbsHome + str(item.attrs[0][1]) for item in urlPages]
Example #18
def fixImageURL(strURL):
    # Provide user feedback
    print bcolors.OKGREEN + "[+] " + bcolors.ENDC + "Finding IMG tags with src=/... for replacement."
    print bcolors.OKGREEN + "[+] " + bcolors.ENDC + "RegEx matches:"
    # Open source, read lines, and begin parsing to replace all incomplete img src URLs
    try:
        # Print img src URLs that will be modified and provide info
        print "\n".join(re.findall('src="(.*?)"', open("source.html").read()))
        print bcolors.OKGREEN + "[+] " + bcolors.ENDC + "Fixing src with " + strURL + "..."
        with open("index.html", "r") as html:
            # Read in the source html and parse with BeautifulSoup
            soup = BeautifulSoup(html)
            # Find all <img> with src attribute and create a full URL to download and embed image(s)
            for img in soup.findAll("img"):
                imgurl = urlparse.urljoin(strURL, img["src"])
                image = urllib.urlopen(imgurl)
                # Encode in Base64 and embed
                img_64 = base64.b64encode(image.read())
                img["src"] = "data:image/png;base64," + img_64
            source = str(soup.prettify(encoding="utf-8"))
            # Write the updated addresses to source.html while removing the [' and ']
            output = open("index.html", "w")
            output.write(source.replace("[", "").replace("]", ""))
            output.close()
            print bcolors.OKGREEN + "[+] " + bcolors.ENDC + "IMG parsing successful. IMG src's fixed."
    except:
        # Exception may occur if file doesn't exist or can't be read/written to
        print bcolors.FAIL + "[-] IMG parsing failed. Make sure the html file exists and is readable." + bcolors.ENDC
Example #19
    def _get_cleaned_text(self, html):
        """Extracts the text contained within the body of the Wikipedia article.

        :param html: HTML source for the entire Wikipedia page
        :type html: str

        :return: a string containing all of the plaintext contained in the
            main content of the article
        :rtype: str
        """
        soup = BeautifulSoup(html)
        main_content = soup.find("div", id="mw-content-text", recursive=True)  # Content for Wikipedia lives in this div

        for comment in main_content.findAll(text=lambda text: isinstance(text, Comment)):  # Extract all comments
            comment.extract()

        for headline in main_content.findAll("span", class_="mw-headline"):  # Remove boilerplate section names
            headline.extract()

        text = main_content.getText(separator=" ")  # Get all plaintext contained in the div

        text = re.sub(r"\[.*?\]", " ", text)  # Remove Wikipedia edit and reference text
        text = re.sub(r"http://.*?\s+", " ", text)  # Remove urls
        text = re.sub(r"\b(?:\d|\w)\b", " ", text)  # Remove single letters, numbers
        text = re.sub(r"\((.*?)\)", " \1 ", text)  # Remove parenthesis around numbers
        text = re.sub(r"\W+", " ", text)  # Remove non-alphanumeric characters
        text = re.sub(r"\s+", " ", text)  # Collapse whitespace
        return text
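The substitutions above run in a fixed order: comments and headline spans are removed from the tree first, and the remaining regexes then clean up the flattened text. A short sketch of a few of those text-level steps on a made-up string:

import re

# Hypothetical plaintext, as it might come out of getText(separator=" ")
text = "Pi (3.14159) is irrational [1] , see http://example.org/pi for details"
text = re.sub(r"\[.*?\]", " ", text)        # drop citation markers like [1]
text = re.sub(r"http://.*?\s+", " ", text)  # drop URLs
text = re.sub(r"\((.*?)\)", r" \1 ", text)  # strip parentheses, keep the contents
text = re.sub(r"\W+", " ", text)            # drop remaining punctuation
text = re.sub(r"\s+", " ", text)            # collapse whitespace
print text  # -> "Pi 3 14159 is irrational see for details"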
Example #20
def getStopETA(sid):
    # ipshell = IPShellEmbed()
    query_url = seta_url + sid
    eta_pg = urllib2.urlopen(query_url).read()
    soup = BeautifulSoup(eta_pg)

    # Stuff is in id DIV ctl02_pnlMain
    pnlmain_div = (soup.findAll("div", attrs={"id": "ctl02_pnlMain"}))[0]
    time = pnlmain_div.findNext("div", "ttmobile_instruction").string
    station = pnlmain_div.findNext("div", "ttmobile_stationname").string
    eta_div = pnlmain_div.findNext("div", attrs={"id": "ttmobile_arrivalbuttons"})
    etas = eta_div.findAll("div")

    stop_etas = []
    curr_direction = ""
    for eta in etas:
        seta = {}
        if eta["class"].startswith("ttmobile_"):
            train = eta.findNext("span", "nexttraindesc")
            teta = eta.findNext("span", "nexttrainarrv")
            trainstr = train.string.strip()
            tr_col, sep, tr_dest = trainstr.partition("&gt;")
            # print " % % " % tr_col.strip(), tr_dest.strip()
            # ipshell()
            seta["color"] = tr_col.strip()
            seta["dest"] = tr_dest.strip()
            # ipshell()
            eta_time = (teta.findNext("b")).string
            eta_time_acc = teta.contents[2].strip()
            seta["eta"] = eta_time + " " + eta_time_acc
            stop_etas.append(seta)
        else:
            continue

    return stop_etas
Example #21
def pres_primary_dem_county():
    with open("20080126__sc__democratic__primary__president__county.csv", "wb") as csvfile:
        w = unicodecsv.writer(csvfile, encoding="utf-8")
        url = "http://www.state.sc.us/cgi-bin/scsec/r108dpf"
        r = requests.post(url, data={"race": "PRESIDENT", "election": "pri08dpf", "prr": "dp"})
        soup = BeautifulSoup(r.text)
        table = soup.find("table")
        rows = table.findAll("tr")[1:]
        first_names = [x.text for x in rows[0].findAll("td") if x.text != ""]
        last_names = [x.text for x in rows[1].findAll("td") if x.text != ""][2:]
        candidates = [" ".join(x) for x in zip(first_names, last_names)]
        headers = ["county", "office", "district", "party", "candidate", "votes"]
        w.writerow(headers)
        for row in rows[2:]:
            for candidate in candidates:
                w.writerow(
                    [
                        row.findAll("td")[0].text,
                        "President",
                        None,
                        "DEM",
                        candidate,
                        row.findAll("td")[candidates.index(candidate) + 2].text,
                    ]
                )
Example #22
    def _get_task_list(self, pagenum, st):
        r = self.session.get(self.task_url + "&st=" + str(st), cookies=dict(pagenum=str(pagenum)))
        if r.error:
            r.raise_for_status()
        soup = BeautifulSoup(r.content)
        gdriveid_input = soup.find("input", attrs={"id": "cok", "type": "hidden"})
        self.gdriveid = gdriveid_input.attrMap["value"]

        result = []
        for task in soup.findAll("div", **{"class": "rw_list"}):
            tmp = dict()
            for each in task.findAll("input"):
                input_id = each.get("id", "")
                if not input_id:
                    continue
                input_attr = input_id.rstrip("1234567890")
                input_value = each.get("value", "")
                tmp[input_attr] = input_value
            assert tmp["input"]
            process = task.find("em", **{"class": "loadnum"})
            assert process.string
            tmp["process"] = float(process.string.rstrip("%"))
            result.append(tmp)
        DEBUG(pformat(result))
        return result
Example #23
def use_BS(url, content):

    return []  # XXX this func is broken

    urls = []
    try:
        from BeautifulSoup import BeautifulSoup
    except:
        return urls
    from urlparse import urlparse

    u = urlparse(url)
    urlprefix = "http://%s" % u.hostname  # note this is possibly
    # broken since it
    # doesn't handle port, etc.  whops
    urlpath = u.path

    bs = BeautifulSoup(content)
    for a in bs.findAll("a"):
        if not a["href"].startswith("http:"):
            if a["href"].startswith("/"):  # non-relative url

                urls.append("%s%s" % (urlprefix, a["href"]))
            else:  # relative url
                urls.append("%s%s/%s" % (urlprefix, urlpath, a["href"]))
        else:  # technically, these should have been picked up already, but the
            # set lets us add them again
            urls.append(a["href"])
    return urls
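use_BS short-circuits on the "XXX this func is broken" marker, so the code below it never runs. A minimal sketch of the link collection it seems to be aiming for, assuming Python 2 and BeautifulSoup 3 as in the rest of these examples, with urlparse.urljoin handling ports and relative paths (the function name is made up):

def collect_links(url, content):
    # Sketch only: resolve every <a href="..."> against the page URL.
    from BeautifulSoup import BeautifulSoup
    from urlparse import urljoin

    return [urljoin(url, a["href"]) for a in BeautifulSoup(content).findAll("a", href=True)]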
Example #24
def main():
    nogood = 1
    x = 0
    while nogood == 1:
        bitly = "http://goo.gl/WNOecx"
        time.sleep(random.randint(12, 15))
        r = requests.get(bitly)
        soup = BeautifulSoup(r.text)
        y = soup.find(id="shelfDiv").find(id="border")
        new = str(y)
        newfile = open("new.txt", "w+")
        newfile.write(new)
        newfile.close()
        reopen = open("new.txt", "w+")
        new = reopen.read()
        reopen.close()
        f = open("baseline.txt", "w+")
        old = f.read()
        if new == old:
            print "round:  " + str(x)
            x += 1
        else:
            server = smtplib.SMTP("smtp.gmail.com", 587)
            server.starttls()
            server.login("openhaijaz@gmail.com", "ektelo9n")
            server.sendmail("me!!", "7032258785@messaging.sprintpcs.com", bitly)
            print "noooooooooo"
            f.write(new)
            nogood = 0
            break
        f.close()
Example #25
    def get(self, regno):
        # self.response.headers['Content-Type'] = 'text/html'
        br = _mechanize.Browser()
        cj = cookielib.CookieJar()
        br.set_cookiejar(cj)
        br.set_handle_equiv(True)
        br.set_handle_redirect(True)
        br.set_handle_referer(True)
        br.set_handle_robots(False)
        n = 262
        while n <= 262:
            m = str(n).zfill(4)  # zero-pad the roll number, e.g. 0001, 0002, ...
            n = n + 1
            # self.response.write('11BEC')  # This is where the roll number prefix goes; for 09BCE just replace it with 09BCE.
            # u = regno
            r = br.open('https://academics.vit.ac.in/parent/parent_login.asp')
            html = r.read()
            soup = BeautifulSoup(html)
            img = soup.find('img', id='imgCaptcha')
            image_response = br.open_novisit(img['src'])
            captcha = Captcha()
            # captcha.cookie = "123456788sids"
            # captcha.image = db.Blob(image_response.read())
            captcha.regno = regno
            for cook in cj:
                captcha.cookie = cook.value
                captcha.cookiename = cook.name
            captcha.put()
            self.response.headers['Content-Type'] = 'image/jpeg'
            self.response.out.write(image_response.read())
Example #26
def test_module1(html=None):

    # runs tests on the faux html generated with buildTestHtml (above).
    html = buildTestHtml()
    searchDepth = 5
    soup = BeautifulSoup(html)
    findObj = FindDateInSoup(searchDepth)

    pobj = soup.findAll(text=re.compile("level 1.0.2.2"))[
        0
    ]  # test1 = sibling has date should return date from sibing object
    dateReturned = findObj.findDate_main(pobj)
    assert dateReturned == datetime(2010, 1, 1)  # "January 1, 2010"

    pobj = soup.findAll(text=re.compile("level 1.0.1"))[0]  # test2: sib has no date, but sib's kids do
    dateReturned = findObj.findDate_main(pobj)
    assert dateReturned == datetime(2010, 1, 1)  # "January 1, 2010"

    pobj = soup.findAll(text=re.compile("level 2.1.1.1"))[0]  # test3: great-grandparent has the date
    dateReturned = findObj.findDate_main(pobj)
    assert dateReturned == datetime(2010, 5, 1)  # "May 1, 2010"

    pobj = soup.findAll(text=re.compile("level 3.1.1.1"))[0]  # test4: a distant relative has the date
    dateReturned = findObj.findDate_main(pobj)
    assert dateReturned == datetime(2010, 6, 1)  # "June 1, 2010"
Example #27
    def test_date_dateparts_form(self):
        """
        Form with a date
        """
        schema = schemaish.Structure()
        schema.add("a", schemaish.Date())

        form_name = "form_name"
        form = formish.Form(schema, form_name)
        form["a"].widget = formish.DateParts()

        request = self.Request(form_name, {"a.year": "", "a.month": "", "a.day": ""})
        data = form.validate(request)
        assert data == {"a": None}

        request_data = {"a.day": "18", "a.month": "12", "a.year": "1966"}
        expected_data = {"a": date(1966, 12, 18)}

        request = self.Request(form_name, request_data)
        data = form.validate(request)
        assert data == expected_data

        form.defaults = expected_data
        htmlsoup = BeautifulSoup(form())
        assert htmlsoup.findAll(id="form_name-a-field")[0]["class"] == "field date dateparts"
        assert htmlsoup.findAll(id="form_name-a")[0]["value"] == "18"
Example #28
def parsing_page(url):
    print url
    page = urllib2.urlopen(url)
    soup = BeautifulSoup(page)
    for embed in soup.findAll("embed"):
        swf_url = dict(embed.attrs)["src"]
        print swf_url
Example #29
    def downloadComic(self):
        br = self.getBrowser()

        comic_page_url = self.getComicPageURL()

        try:
            comic_page_request = br.open(comic_page_url)
        except:
            return False

        html = comic_page_request.read()

        soup = BeautifulSoup(html)

        img = soup.find("img", attrs={"class": "img-responsive img-comic"})

        if img:
            img_url = img.get("src", "")

            # Get the image data and extension
            (img_data, extension) = self.downloadImage(br, img_url)

            filename = "%s-%s.%s" % (self.strip, self.datestamp, extension)

            # Save the comic to the filesystem
            self.writeComic(filename, img_data)

            return True  # Causes sleep on successful download
Example #30
    def _product_urls_and_types(cls, product_types):
        browser = mechanize.Browser()

        url_extensions = [
            ["38-notebooks", "Notebook"],
            ["7-monitores-y-proyectores", "Monitor"],
            ["41-placas-madre", "Motherboard"],
            ["44-procesadores", "Processor"],
            ["54-tarjetas-de-video", "VideoCard"],
            ["28-memoria-ram", "Ram"],
            ["5-discos-duros", "StorageDrive"],
            ["19-fuentes-de-poder", "PowerSupply"],
            ["16-gabinetes", "ComputerCase"],
        ]

        product_links = []
        for url_extension, ptype in url_extensions:
            if ptype not in product_types:
                continue
            url = "http://www.mybox.cl/" + url_extension + "?n=50"

            soup = BeautifulSoup(browser.open(url).get_data())
            prod_list = soup.find("ul", {"id": "product_list"})

            if not prod_list:
                continue

            prod_cells = prod_list.findAll("li")

            for cell in prod_cells:
                product_links.append([cell.find("a")["href"], ptype])

        return product_links