Code Example #1
File: filters.py Project: AloneRoad/jupo
# Implied imports from the surrounding project (assuming BeautifulSoup 4);
# `cache` is the project's own caching helper, not a standard-library module.
from bs4 import BeautifulSoup, Tag, NavigableString

def remove_empty_lines(html):
  key = '%s:remove_empty_lines' % hash(html)
  out = cache.get(key, namespace="filters")
  if out:
    return out
  
  if '</' in html:
    html = html.strip().replace('\n', '')
    soup = BeautifulSoup(html, 'html.parser')  # explicit parser avoids bs4's warning
    lines = []
    for element in soup.contents:
      if isinstance(element, Tag):
        if element.text:
          lines.append(str(element).strip())
        elif 'br' in str(element):
          lines.append('\n')
      elif isinstance(element, NavigableString):
        lines.append(str(element).strip())
    out = ''.join(lines).strip()
    while '\n\n' in out:
      out = out.replace('\n\n', '\n')
  else:
    out = '\n'.join([line for line in html.split('\n') if line.strip()])
  cache.set(key, out, namespace="filters")
  return out
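
The while '\n\n' in out loop is the heart of this filter: it folds any run of blank lines down to a single newline. A minimal standalone sketch of just that step:

def collapse_newlines(s):
    # repeatedly fold double newlines until no blank lines remain
    while '\n\n' in s:
        s = s.replace('\n\n', '\n')
    return s

print(repr(collapse_newlines('a\n\n\n\nb')))  # prints 'a\nb'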
Code Example #2
File: utils.py Project: qwang2505/transcode
# A classmethod on the project's Utils helper class (the decorator and class
# body are omitted in this excerpt).
def is_hidden_node(cls, node):
    """Check whether a node is hidden in the html page via an inline
    style of display:none.
    """
    style_list = node.get("style", None)
    if style_list:
        for p in style_list.split(";"):
            tokens = p.split(":")
            if (len(tokens) >= 2 and tokens[0].strip().lower() == "display"
                    and tokens[1].strip().lower() == "none"):
                return True
    return False
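
A self-contained usage sketch, assuming lxml elements (the function body repeats the same check, unbound from its class for the demo):

import lxml.html

def is_hidden(node):
    # same inline-style check as above, outside the class
    style = node.get("style")
    if style:
        for p in style.split(";"):
            tokens = p.split(":")
            if (len(tokens) >= 2 and tokens[0].strip().lower() == "display"
                    and tokens[1].strip().lower() == "none"):
                return True
    return False

print(is_hidden(lxml.html.fromstring('<p style="display: NONE">x</p>')))  # True
print(is_hidden(lxml.html.fromstring('<p style="color:red">x</p>')))      # False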
Code Example #3
File: utils.py Project: qwang2505/transcode
# Also a Utils classmethod; Utils._shrink_properties is project-internal.
def shrink_style(cls, style_str, filtered_css_properties, changed_css_properties):
    """Parse an inline style string into a property dict, then delegate
    filtering and rewriting to Utils._shrink_properties.
    """
    if not style_str:
        return None
    properties = {}
    for p in style_str.split(";"):
        if p.strip():
            token = p.split(":", 1)  # split once, so values like url(http://...) survive
            if len(token) > 1:
                properties[token[0].strip()] = token[1].strip()
    return Utils._shrink_properties(properties, filtered_css_properties, changed_css_properties)
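
The parsing half on its own, as a runnable sketch (Utils._shrink_properties is internal to the project, so this hypothetical helper stops at the property dict):

def parse_inline_style(style_str):
    # 'display:none; color: red' -> {'display': 'none', 'color': 'red'}
    properties = {}
    for p in (style_str or '').split(';'):
        if p.strip():
            token = p.split(':', 1)
            if len(token) > 1:
                properties[token[0].strip()] = token[1].strip()
    return properties

print(parse_inline_style('display:none; color: red'))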
Code Example #4
import re

def fields_from_split_html(template, html, separator, regex_with_groups_named_as_keys):
    list_to_ret = []
    lines = html.split(separator)
    for line in lines:
        m = re.match(regex_with_groups_named_as_keys, line)
        if m is not None:  # clearer than the original hasattr(m, 'groupdict') check
            # merge the template defaults with the named capture groups
            dict_to_ret = dict(template)
            dict_to_ret.update(m.groupdict())
            list_to_ret.append(dict_to_ret)

    return list_to_ret
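
A quick usage example: every chunk that matches the regex yields one dict merging the template's defaults with the named groups, and non-matching chunks are skipped:

rows = fields_from_split_html(
    {'source': 'listing'},            # template defaults
    'name: Alice|name: Bob|garbage',  # "html" to be split on '|'
    '|',
    r'name: (?P<name>\w+)')
print(rows)
# [{'source': 'listing', 'name': 'Alice'}, {'source': 'listing', 'name': 'Bob'}]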
Code Example #5
    # A method of an Open States (billy) legislator scraper class; `json`,
    # `Legislator`, and `self.get` come from the surrounding module.
    def scrape(self, chamber, term_name):

        for t in self.metadata['terms']:
            if t['name'] == term_name:
                session = t['sessions'][-1]
                slug = self.metadata['session_details'][session]['slug']

        if chamber == 'upper':
            chamber_slug = 'Senate'
        elif chamber == 'lower':
            chamber_slug = 'Assembly'

        leg_base_url = 'http://www.leg.state.nv.us/App/Legislator/A/%s/%s/' % (chamber_slug, slug)
        leg_json_url = 'http://www.leg.state.nv.us/App/Legislator/A/api/%s/Legislator?house=%s' % (slug, chamber_slug)

        resp = json.loads(self.get(leg_json_url).text)

        for item in resp:
            # empty district
            empty_names = ['District No', 'Vacant']
            if any(name in item['FullName'] for name in empty_names):
                continue
            last, first = item['FullName'].split(",", 1)
            item['FullName'] = "{first} {last}".format(last=last,
                                                       first=first).strip()
            leg = Legislator(term_name, chamber, item['DistrictNbr'],
                             item['FullName'], party=item['Party'],
                             photo_url=item['PhotoURL'])
            leg_url = leg_base_url + item['DistrictNbr']

            # hack to get the legislator ID
            html = self.get(leg_url).text
            for l in html.split('\n'):
                if 'GetLegislatorDetails' in l:
                    leg_id = l.split(',')[1].split("'")[1]

            # fetch the json used by the page (the original hard-coded the
            # '78th2015' session slug here instead of reusing `slug`)
            leg_details_url = ('https://www.leg.state.nv.us/App/Legislator/A/api/'
                               '%s/Legislator?id=%s' % (slug, leg_id))
            leg_resp = json.loads(self.get(leg_details_url).text)
            details = leg_resp['legislatorDetails']

            address = details['Address1']
            address2 = details['Address2']
            if address2:
                address += ' ' + address2
            address += '\n%s, NV %s' % (details['City'], details['Zip'])

            phone = details['LCBPhone']
            email = details['LCBEmail']

            leg.add_office('district', 'District Address', address=address,
                           phone=phone, email=email)
            leg.add_source(leg_details_url)
            self.save_legislator(leg)
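
The leg_id hack above pulls the second quoted argument out of a GetLegislatorDetails(...) JavaScript call by splitting on commas and quotes. A regex over a hypothetical markup line does the same job more defensively:

import re

line = "onclick=\"GetLegislatorDetails('Senate', '123')\""  # hypothetical markup
m = re.search(r"GetLegislatorDetails\('[^']*',\s*'([^']+)'", line)
if m:
    print(m.group(1))  # 123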
Code Example #6
def html_clean(html):
    import re
    import lxml.html.clean
    import lxml.html
    import lxml.etree
    from tidylib import tidy_document  # pytidylib; implied by the call below
    from bs4 import BeautifulSoup      # assuming BeautifulSoup 4

    html, errors = tidy_document(html,
                                 # Tidy options: http://tidy.sourceforge.net/docs/quickref.html
                                 options={'bare': 1, 'clean': 1, 'output-xhtml': 1,
                                          'drop-font-tags': 1, 'drop-proprietary-attributes': 1,
                                          'hide-comments': 1,
                                          'char-encoding': 'utf8', 'input-encoding': 'utf8', 'output-encoding': 'utf8'})
    cleaner = lxml.html.clean.Cleaner(
        kill_tags=frozenset(['script', 'style', 'option']),
        remove_tags=frozenset(['a', 'strong', 'em']),
        safe_attrs_only=True, safe_attrs=frozenset())
    html = cleaner.clean_html(html)

    # html = lxml.etree.tostring(lxml.html.fromstring(html), pretty_print=True).decode('utf8')

    # html = html.encode('utf-8', errors='strict')
    soup = BeautifulSoup(html, 'html.parser')  # explicit parser avoids bs4's warning
    # [s.extract() for s in soup('script')]  # remove 'script', 'style', 'option' tags
    # [s.extract() for s in soup('style')]
    # [s.extract() for s in soup('option')]
    html = soup.prettify()

    # html = htmllaundry.strip_markup(html)  # leave only text

    # remove continuous empty lines
    html = re.sub(r'\n\s*\n+', '\n\n', html).strip()
    html = re.sub(r'[ \t]+', ' ', html, flags=re.M).strip()  # collapse runs of spaces (the original passed re.M as the positional `count` argument)

    # cleaned_html = [sent for sent in cleaned_html.split(
    # '\n')]  # if len(sent.split()) == 0 or len(sent.split()) >= 6]
    html_lines = html.split('\n')
    # return html_lines
    # html_sent_word_tokenize is a project-specific tokenizer defined elsewhere
    return list(html_sent_word_tokenize(html_lines))
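
The flags fix above deserves a standalone note: re.sub's fourth positional parameter is count, not flags, so passing re.M positionally silently caps the number of substitutions:

import re

s = 'a  b  c  d  e  f  g  h  i  j'  # nine runs of double spaces
print(re.sub(r'[ ]+', ' ', s, re.M))        # re.M == 8, so only 8 runs collapse
print(re.sub(r'[ ]+', ' ', s, flags=re.M))  # all nine runs collapse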
Code Example #7
        # (this excerpt begins mid-loop over the page's <a> elements)
        url = link.get('href')
        if url is not None and 'incident-reports' in url and 'read more' not in link.text:
            reportUrls.append('http://www.wrps.on.ca' + url)
        elif url is not None and 'next' in link.text:
            listUrls.append('http://www.wrps.on.ca' + url)

g = geocoders.Google(domain='maps.google.ca')

#scraperwiki.sqlite.execute('drop table swdata')
#scraperwiki.sqlite.commit()

reportUrls.reverse()
for report in reportUrls:
    print report
    html = scraperwiki.scrape(report)
    lines = html.split('\n')
    itype = None
    for line in lines:
        match = re.search('<h1 class="title">.*eports( for)? (.*?)( ?- ?[Uu][Pp][Dd][Aa][Tt][Ee].*)?</h1>', line)
        if match:
            try:
                reportdate = datetime.strptime(match.group(2), '%B %d, %Y')
            except ValueError:
                # some report titles omit the space after the month name
                reportdate = datetime.strptime(match.group(2), '%B%d, %Y')
            continue
        match = re.search('^<p>.*?Incident # ([0-9]{2}-[0-9]{6}).*?Type : ([^&]*)&?.*?<br />(.*?)<br />(.*?)</p>', line.strip())
        if match:
            if itype is not None:
                processIncident(incident, itype, location, text, reportdate)
                # (snippet truncated in the original source)
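
The nested try/except above handles report titles that omit the space after the month name; the same fallback pattern, isolated:

from datetime import datetime

def parse_report_date(s):
    # try the normal 'Month DD, YYYY' form first, then the squashed variant
    for fmt in ('%B %d, %Y', '%B%d, %Y'):
        try:
            return datetime.strptime(s, fmt)
        except ValueError:
            continue
    return None

print(parse_report_date('June 7, 2013'))
print(parse_report_date('June7, 2013'))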
Code Example #8
File: content_extract.py Project: liuzl/nlp4econ
# RE_IMG, RE_IMG_SRC, RE_TAG, RE_DATETIME, BLOCKS_WIDTH, THRESHOLD,
# is_useful_line and strtotime are module-level definitions in this project.
def get_main_content(html):
    if not isinstance(html, unicode):  # Python 2 code (unicode/xrange/iteritems)
        return '', ''

    html_lines_len = [len(x.strip()) for x in html.split('\n')]  # (unused below)

    # stash image tags: replace each <img> with an md5 placeholder
    images = {}
    for img in re.findall(RE_IMG, html):
        md5 = hashlib.md5(img.encode('utf-8','ignore')).hexdigest()[:16]
        html = html.replace(img, md5)
        r = re.findall(RE_IMG_SRC, img)
        if len(r) == 1: src = r[0][1]
        else: src = ''
        images[md5] = "<img src='%s'>" % src  # img

    # strip all html tags
    text = re.sub(RE_TAG, '', html)

    # extract the publication time
    time = ''
    t = re.findall(RE_DATETIME, text)
    if len(t) > 0:
        time = t[0][0]

    lines = [x.strip() if is_useful_line(x) else '' for x in text.split('\n')]
    index_dist = []
    size = len(lines)
    for i in xrange(size - BLOCKS_WIDTH + 1):
        char_num = 0
        for j in xrange(i, i + BLOCKS_WIDTH):
            strip = re.sub(ur'\s+', '', lines[j])
            char_num += len(strip)
        index_dist.append(char_num)
    main_text = ''
    fstart = -1
    start = -1
    end = -1
    flag_s = False
    flag_e = False
    first_match = True
    # scan window densities for dense runs: start a block when density rises,
    # end it when density falls to zero; note the i+2/i+3 look-aheads below
    # can raise IndexError near the end of index_dist in the original code
    for i in xrange(len(index_dist) - 1):
        if first_match and not flag_s:
            if index_dist[i] > THRESHOLD / 2:
                if index_dist[i+1] != 0 or index_dist[i+2] != 0:
                    first_match = False
                    flag_s = True
                    start = i
                    fstart = i
                    continue
        if index_dist[i] > THRESHOLD and not flag_s:
            if index_dist[i+1] != 0 or index_dist[i+2] != 0 or index_dist[i+3] != 0:
                flag_s = True
                start = i
                continue
        if flag_s:
            if index_dist[i] == 0 or index_dist[i+1] == 0:
                end = i
                flag_e = True
        tmp = ''
        if flag_e:
            for ii in xrange(start, end+1):
                if (len(lines[ii]) < 1): continue
                tmp += lines[ii] + '\n'
            main_text += tmp
            flag_s = flag_e = False

#    for pre in xrange(fstart - 1, max(0, fstart - BLOCKS_WIDTH), -1):
#        for md5 in images.keys():
#            if lines[pre].find(md5) > 0:
#                main_text = lines[pre] + '\n' + main_text
#                break

    for md5,img in images.iteritems():
        main_text = main_text.replace(md5, img)
    return strtotime(time), main_text
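
The extraction above is a text-density heuristic: strip the tags, slide a BLOCKS_WIDTH-line window over the text, and keep runs whose window character counts stay high. The windowed density computation, in miniature:

def line_densities(lines, width=3):
    # non-whitespace character count of each `width`-line window
    counts = [len(''.join(l.split())) for l in lines]
    return [sum(counts[i:i + width]) for i in range(len(counts) - width + 1)]

print(line_densities(['nav', '', 'A long article paragraph sits here.',
                      'Another dense line of body text.', '', 'footer']))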
Code Example #9
#print "ok handle"
# Want debugging messages?
br.set_debug_http(True)
br.set_debug_redirects(True)
br.set_debug_responses(True)
    
# User-Agent 
br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
#print "ok headers"
the_list = []

list_url = "http://greencracker.net/wp-content/uploads/2013/06/elevenn.csv"
response = br.open(list_url)
html = response.read()

the_list = html.split("*")
for i in range(len(the_list)):
    the_list[i] = the_list[i].replace("\r", "")
    the_list[i] = the_list[i].replace("'", "")
    the_list[i] = the_list[i].replace('"', "")
    the_list[i] = the_list[i].strip()

print the_list


target = "http://www.nsopw.gov/en-us/Search"
response = br.open(target)
html = response.read()
print html
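
For context, br is a mechanize.Browser constructed earlier in the script; a minimal setup sketch (the robots.txt line is an assumption, common in scrapers of this era):

import mechanize

br = mechanize.Browser()
br.set_handle_robots(False)  # assumed: the original likely skips robots.txt checks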
Code Example #10
# `page` is a requests response fetched earlier; `html` here is lxml.html,
# though the name is shadowed by the string assigned below.
tree = html.fromstring(page.text)

#This grabs the first moduleBody div (the reflection text)
reflection_html = tree.xpath('//div[@class="moduleBody"]')[0]
html = etree.tostring(reflection_html, pretty_print=True)

html = html.replace('\r\n', '\n')
html = html.replace('<br/>', '\n').replace('&#160;', ' ').replace('\n \n', '\n\n')

new_html = ""
for line in html:
    new_html += line.strip()
    new_html += "\r"
#html = new_html

parts = html.split("<h1> </h1>")


#with open("reflection.html", 'w') as out:
#    out.write(html.encode('utf8'))
#print "{} parts.".format(len(parts))

# `converter` (an html-to-markdown converter) is set up earlier in the script.
with open('output.markdown', 'w') as out_all:
    for count, part in enumerate(parts):
        # convert each part; the original converted the whole `html` here and
        # then wrote the raw part, leaving `markdown` unused
        markdown = converter.convert(part, 'markdown', format='html')
        out_all.write(markdown.encode('utf8'))
        if count+1 < len(parts):
            out_all.write("\n\\pagebreak\n")
        with open('output{}.markdown'.format(count), 'w') as out:
            out.write(markdown.encode('utf8'))
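
A closing note on the loop fixed above: iterating over a Python str yields characters, not lines, so the original for line in html: walked the document one character at a time; splitlines() gives line-wise iteration:

html = 'a\nb'
print([c for c in html])  # ['a', '\n', 'b']  (characters)
print(html.splitlines())  # ['a', 'b']        (lines)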