Example #1
 def test_product_card_location(self):
     response = self.client.get("/datadocument/179486/")
     html = response.content.decode("utf-8")
     e_idx = html.index('id="extracted-text-title"')
     p_idx = html.index('id="product-title"')
     self.assertTrue(p_idx < e_idx, ("Product card should come before "
                                     "Extracted Text card"))
Example #2
 def test_product_card_location(self):
     response = self.client.get('/datadocument/179486/')
     html = response.content.decode('utf-8')
     e_idx = html.index('<h4>Extracted Text</h4>')
     p_idx = html.index('<h4 class="d-inline">Products</h4>')
     self.assertTrue(p_idx > e_idx, ('Product card should come after '
                                     'Extracted Text card'))
Example #3
 def block_html(self, html):
     if self.texoid and html.startswith('<latex'):
         attr = html[6:html.index('>')]
         latex = html[html.index('>') + 1:html.rindex('<')]
         latex = self.parser.unescape(latex)
         result = self.texoid.get_result(latex)
         if not result:
             return '<pre>%s</pre>' % mistune.escape(latex, smart_amp=False)
         elif 'error' not in result:
             img = ('''<img src="%(svg)s" onerror="this.src='%(png)s';this.onerror=null"'''
                    'width="%(width)s" height="%(height)s"%(tail)s>') % {
                       'svg': result['svg'], 'png': result['png'],
                       'width': result['meta']['width'], 'height': result['meta']['height'],
                       'tail': ' /' if self.options.get('use_xhtml') else ''
                   }
             style = ['max-width: 100%',
                      'height: %s' % result['meta']['height'],
                      'max-height: %s' % result['meta']['height'],
                       'width: %s' % result['meta']['width']]
             if 'inline' in attr:
                 tag = 'span'
             else:
                 tag = 'div'
                 style += ['text-align: center']
             return '<%s style="%s">%s</%s>' % (tag, ';'.join(style), img, tag)
         else:
             return '<pre>%s</pre>' % mistune.escape(result['error'], smart_amp=False)
     return super(AwesomeRenderer, self).block_html(html)
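The slicing above peels the attribute text and the body out of a <latex ...>...</latex> block by locating the first '>' and the last '<'. A standalone illustration on a made-up input:

html = '<latex inline>\\frac{1}{2}</latex>'
attr = html[6:html.index('>')]                        # ' inline'
body = html[html.index('>') + 1:html.rindex('<')]     # '\\frac{1}{2}'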
Example #4
def find_link(html, start_index):
    pivot_index = html.index(Config.AMAZON_SEARCH_PIVOT, start_index)
    href = "href=\""
    link_start = html.index(href, pivot_index) + len(href)
    link_end = html.index("\"", link_start)
    link = html[link_start:link_end]
    result = LinkResult(link, link_end)
    return result
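Config.AMAZON_SEARCH_PIVOT and LinkResult come from the surrounding project and are not shown; a self-contained variant of the same href extraction, with an invented pivot string and input:

def find_next_href(html, pivot, start_index=0):
    # Locate the pivot, then grab the first href value after it.
    pivot_index = html.index(pivot, start_index)
    link_start = html.index('href="', pivot_index) + len('href="')
    link_end = html.index('"', link_start)
    return html[link_start:link_end], link_end

html = '<div class="s-result"><a href="https://example.com/dp/B000">x</a></div>'
link, resume_at = find_next_href(html, 'class="s-result"')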
Example #5
def getCalories(html):
    try:
        string1 = "Calories&nbsp;"
        string2 = "</b>"
        start = html.index(string1) + len(string1)
        end = html.index(string2,start)
        return int(html[start:end])
    except ValueError:  # marker missing, or non-numeric text between the markers
        return -1
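An alternative sketch that uses str.find, which returns -1 instead of raising, so the failure paths are explicit rather than funneled through the exception handler; the marker strings match the example above:

def get_calories_find(html):
    marker, end_marker = "Calories&nbsp;", "</b>"
    start = html.find(marker)
    if start == -1:
        return -1
    start += len(marker)
    end = html.find(end_marker, start)
    if end == -1:
        return -1
    try:
        return int(html[start:end])
    except ValueError:  # non-numeric text between the markers
        return -1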
Example #6
 def get_data(self, url):
     req = requests.get(url=url, headers=self.headers)
     req.encoding = 'utf-8'
     html = req.text
     html = html[html.index("(") + 1:]
     html = html[:html.index(")"):]
     try:
         product_dic = demjson.decode(html)
         return product_dic
     except Exception:
         return ""
Example #7
def is_amazon_seller(html):
    seller_index = html.index(Config.AMAZON_OTHERSELLERS_SELLERINFO_PIVOT)
    seller_end = html.index(Config.AMAZON_OTHERSELLERS_SELLERINFO_END_PIVOT,
                            seller_index)
    try:
        amazon_index = html.index(Config.AMAZON_OTHERSELLERS_PROPRIETARY_TAG,
                                  seller_index)
        if amazon_index < seller_end:
            return True
        else:
            return False
    except ValueError:  # proprietary tag not found after seller_index
        return False
Example #8
 def test_ingredient_rank(self):
     doc = DataDocument.objects.get(pk=254643)
     qs = doc.extractedtext.rawchem.select_subclasses()
     one = qs.first()
     two = qs.last()
     self.assertTrue(two.ingredient_rank > one.ingredient_rank)
     response = self.client.get(f"/datadocument/{doc.pk}/")
     html = response.content.decode("utf-8")
     first_idx = html.index(f'id="chem-{one.pk}"')
     second_idx = html.index(f'id="chem-{two.pk}"')
     self.assertTrue(
         second_idx > first_idx,
         ("Ingredient rank 1 comes before "
          "Ingredient rank 2"),
     )
Example #9
def scrapeGame(url):
    html = scraperwiki.scrape(url + "/games?xml=1")
    # Crude XML slicing: keep the text between the <gamesList> tags,
    # then between the <games> tags, before splitting on <game>.
    sindex = html.index("gamesList") + 10
    eindex = html.rindex("gamesList")
    bodycontent = html[sindex:eindex]
    sindex = bodycontent.index("games") + 6
    eindex = bodycontent.rindex("games") - 2
    rows = bodycontent[sindex:eindex]
    gamestats = []
    for g in rows.split("<game>"):
        start = g.find("<name>")
        end = g.find("</name>")
        start2 = g.find("<hoursOnRecord>")
        end2 = g.find("</hoursOnRecord>")
        if start != -1 and end != -1:
            name = g[start + 6:end]
            # Strip the <![CDATA[ ... ]]> wrapper around the name.
            game = name[9:len(name) - 3]
            hours = 0.0
            if start2 != -1 and end2 != -1:
                hours = float(g[start2 + 15:end2])
            gamestats.append({"game": game, "hours": hours})
    return gamestats
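Since the response is XML, a real parser is more robust than index slicing. A sketch with xml.etree.ElementTree on a made-up document shaped like the gamesList feed above:

import xml.etree.ElementTree as ET

xml = ('<gamesList><games>'
       '<game><name><![CDATA[Portal 2]]></name>'
       '<hoursOnRecord>12.3</hoursOnRecord></game>'
       '<game><name><![CDATA[Dota 2]]></name></game>'
       '</games></gamesList>')

gamestats = []
for game in ET.fromstring(xml).iter("game"):
    hours = game.findtext("hoursOnRecord")   # None when the tag is absent
    gamestats.append({"game": game.findtext("name"),
                      "hours": float(hours) if hours else 0.0})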
Example #10
def get_bestseller_rank(resp):
    html = resp.text  # decoded text, so the substring searches below work on Python 3
    if "Robot Check" in html:
        raise EnvironmentError("CAPTCHA :(")
    pivot = html.index(Config.AMAZON_SELLERRANK_PIVOT)
    numb_start = html.rfind("#", 0, pivot) + 1
    numb_end = html.find(" ", numb_start)
    rank = html[numb_start:numb_end]
    return rank
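The rank is found by scanning backwards from a pivot with rfind and forwards with find. A self-contained demonstration with an invented pivot and snippet (Config.AMAZON_SELLERRANK_PIVOT is project-specific):

html = 'Best Sellers Rank: #1,234 in Toys &amp; Games'
pivot = html.index(" in Toys")
numb_start = html.rfind("#", 0, pivot) + 1
numb_end = html.find(" ", numb_start)
rank = html[numb_start:numb_end]   # '1,234'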
Example #11
def get_othersellers_lowest_prices(resp):

    html = resp.text  # decoded text, so the substring searches below work on Python 3
    if "Robot Check" in html:
        raise EnvironmentError("CAPTCHA :(")

    if is_amazon_seller(html):
        return [-1, -1, -1]

    prices = [0, 0, 0]  #main, shipping, tax

    offer_start = html.index("a-spacing-mini olpOffer")
    offer_end = html.index("</p>", offer_start)

    try:
        mainprice_index = html.index(
            Config.AMAZON_OTHERSELLERS_OFFERPRICE_PIVOT, offer_start)
        if mainprice_index > offer_end:
            print("mainprice_index wastoo big.")
            return prices
        prices[0] = get_price(html, mainprice_index)
    except Exception as e:
        print("Error in mainprice_index: " + e.message)

    try:
        shippingprice_index = html.index(
            Config.AMAZON_OTHERSELLERS_SHIPPINGPRICE_PIVOT, offer_start)
        if shippingprice_index < offer_end:
            prices[1] = get_price(html, shippingprice_index)
    except Exception as e:
        print("Error in shippingprice_index: " + e.message)

    try:
        taxprice_index = html.index(Config.AMAZON_OTHERSELLERS_TAXPRICE_PIVOT,
                                    offer_start)
        if taxprice_index < offer_end:
            prices[2] = get_price(html, taxprice_index)
    except Exception as e:
        print("Error in taxprice_index: " + e.message)

    return prices
Example #12
def get_main_table_from_file(f, year):
    """Return the ``mainTable`` table from file ``f`` for year ``year`` while
        taking care of all the obfuscation and garbage.
    """

    if year <= 2011:
        if year <= 2009:
            html = f.read()
        else: # Siveco's HTML inside HTML
            for line in f:
                html = get_inner_html(line)
                if html:
                    break
            else:
                return None
        html = html[html.index('<HTML>'):] # remove leading garbage
    else: # finally a plain HTML
        html = f.read()
    return get_main_table(html, year)
Example #13
def get_blue_peter_content(url):
    try:
        today = datetime.date.today()

        html = urlopen(url).read()
        btn = html.index("btnlounaslista.gif")
        end = html[:btn].rindex("pdf") + 3
        start = html[:end].rindex("http://")
        pdf_url = html[start:end]

        pdf_raw_data = urlopen(pdf_url).read()

        pdf = pyPdf.PdfFileReader(io.BytesIO(pdf_raw_data))
        pdf_text = pdf.pages[0].extractText()
        pdf_text = re.sub("Liikelounasmenu ", '', pdf_text)

        delimiters = "MAANANTAI|TIISTAI|KESKIVIIKKO|TORSTAI|PERJANTAI|VL ="
        return [re.split(delimiters, pdf_text)[datetime.date.weekday(today) + 1]]
    except ValueError:
        print "Failed to find Blue peter pdf"
        return []
Example #14
 def connect(self):
     """ connect to onleihe website emulating webbrowser - cookies added to requests session """
     response = self.session.get(STARTURL)
     LOGGER.debug("got cookies: %s" %
                  self.session.cookies)  # a RequestsCookieJar
     LOGGER.debug("JSESSIONID='%s'" % (response.cookies.get('JSESSIONID')))
     assert self.session.cookies.get('JSESSIONID') == response.cookies.get(
         'JSESSIONID')
     html = response.content.decode('utf-8')
     #if not '<title>die OnleiheRegio. Startseite</title>' in html:
     if 'title="die OnleiheRegio"' not in html or 'An unexpected error has occurred!' in html:
         open('onliehe_start_bad.html', 'w').write(html)
         raise ValueError("unexpected response from onleihe url=%s." %
                          STARTURL)
     # <section id="simple-search">
     # <h3>Einfache Suche</h3>
     if html.find('<section id="simple-search">') < 0:  # find returns -1; index would raise ValueError
         open('onliehe_missing_search.html', 'w').write(html)
         raise ValueError("missing search on onleihe page url=%s." %
                          STARTURL)
     return
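On the find fix above: str.index raises ValueError when the needle is absent, so an `< 0` comparison is only meaningful with str.find. A two-line demonstration:

html = '<html><body>no search form here</body></html>'
assert html.find('<section id="simple-search">') < 0   # -1; html.index(...) would raise ValueError here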
Example #15
import scraperwiki
import lxml.html
from datetime import datetime

# `html` and `root` are assumed to come from an earlier scrape step, e.g.
# html = scraperwiki.scrape(url); root = lxml.html.fromstring(html)

i = 0
for el in root.cssselect("span.surveyNumber"):
    if el.text[-1] == "%":
        if i == 0:
            bullish = el.text.strip().replace("%", "")
        if i == 1:
            neutral = el.text.strip().replace("%", "")
        if i == 2:
            bearish = el.text.strip().replace("%", "")
        i = i + 1

print bullish
print neutral
print bearish

index = html.index("Week ending")
print index
date = html[index + 11:index + 21].strip()
print date

date_object = datetime.strptime(date, '%m/%d/%Y')

scraperwiki.sqlite.save(unique_keys=["survey_date", "mood"], data={"survey_date": date_object, "mood": "bullish", "survey_percentage": bullish})
scraperwiki.sqlite.save(unique_keys=["survey_date", "mood"], data={"survey_date": date_object, "mood": "neutral", "survey_percentage": neutral})
scraperwiki.sqlite.save(unique_keys=["survey_date", "mood"], data={"survey_date": date_object, "mood": "bearish", "survey_percentage": bearish})
Example #16
def get_price(html, pivot_index):
    dollar_index = html.index("$", pivot_index)
    ending_index = html.index("</span>", pivot_index)
    if dollar_index > ending_index:
        return 0
    return parse_price(html, dollar_index)
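parse_price is not included in the source; a plausible sketch of what it might do, assuming the number directly follows the located "$" (this helper is hypothetical, not the project's actual implementation):

def parse_price(html, dollar_index):
    # Consume digits, commas and dots after the "$", then normalize.
    end = dollar_index + 1
    while end < len(html) and (html[end].isdigit() or html[end] in ".,"):
        end += 1
    return float(html[dollar_index + 1:end].replace(",", ""))

parse_price("<span>$1,234.56</span>", 6)   # 1234.56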
Example #17
def get_mainTable(html):
    html = html[html.index('<HTML>'):]
    html = unicode(html, 'utf-8')
    doc = lxml.html.fromstring(html)
    main_table = doc.xpath(r'''//table[@id="mainTable"]''')[0]
    return main_table
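The same "trim leading garbage, then parse" idea in Python 3, where the bytes/str distinction replaces the unicode() call; the input is made up:

import lxml.html

raw = b'\xff\xfegarbage<HTML><table id="mainTable"><tr><td>x</td></tr></table></HTML>'
html = raw[raw.index(b'<HTML>'):].decode('utf-8')
main_table = lxml.html.fromstring(html).xpath('//table[@id="mainTable"]')[0]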