def extract_comments(html):
    """Yield one dict per comment found in an HTML string.

    The markup is parsed with ``lxml.html.fromstring`` and every element
    matching ``.comment-item`` produces a dict with the keys:

    * ``cid``    -- the item's ``data-cid`` attribute (``None`` if absent)
    * ``text``   -- text content of the first ``.comment-text-content`` match
    * ``time``   -- stripped text content of the first ``.time`` match
    * ``author`` -- text content of the first ``.user-name`` match
    * ``tag``    -- result of ``get_comment_sentiment`` applied to ``text``

    Raises ``IndexError`` if an item lacks any of the three sub-elements.
    """
    tree = lxml.html.fromstring(html)
    item_sel = cs('.comment-item')
    text_sel = cs('.comment-text-content')
    time_sel = cs('.time')
    author_sel = cs('.user-name')

    for item in item_sel(tree):
        # Evaluate the text selector once and reuse the result for both the
        # 'text' field and the sentiment tag instead of querying twice.
        comment_text = text_sel(item)[0].text_content()
        yield {'cid': item.get('data-cid'),
               'text': comment_text,
               'time': time_sel(item)[0].text_content().strip(),
               'author': author_sel(item)[0].text_content(),
               'tag': get_comment_sentiment(comment_text)}
Beispiel #2
0
def get_scores(text):
    """Parse a score page into a dict keyed by group name.

    Rows are taken from ``table table table[bgcolor="#666666"] tr`` and rows
    with empty text content are ignored.  The remaining rows are consumed
    as a small state machine:

    * the first row, and any later row that does not have exactly four
      children, starts a new group (its cleaned text is the key);
    * the row directly after a group header is stored under ``"updated"``;
    * every following four-child row is appended to the group's
      ``"scores"`` list as ``{"time", "home", "away", "score"}``, taken
      from children 0, 1, 3 and 2 respectively.
    """
    doc = document_fromstring(text)
    row_selector = cs("table table table[bgcolor=\"#666666\"] tr")
    rows = [row for row in row_selector(doc) if row.text_content()]

    results = {}
    current = None

    for row in rows:
        if current is None:
            # Very first non-empty row: opens the first group.
            current = strip(row.text_content().strip())
            results[current] = {}
        elif not results[current]:
            # A freshly opened group is still empty; this row is its
            # "updated" line.
            results[current]["updated"] = strip(row.text_content())
        elif len(row) == 4:
            entry = {
                "time": strip(row[0].text_content()),
                "home": strip(row[1].text_content()),
                "away": strip(row[3].text_content()),
                "score": strip(row[2].text_content()),
            }
            results[current].setdefault("scores", []).append(entry)
        else:
            # Anything else is the header of the next group.
            current = strip(row.text_content())
            results[current] = {}
    return results
Beispiel #3
0
def text(selector, html):
    """Return the stripped text of every element matching ``selector``.

    The texts of all matches under ``html`` are stripped individually and
    concatenated without a separator; no match yields an empty string.
    """
    matches = cs(selector)(html)
    # Joining covers all cases: zero matches -> "", one match -> its text,
    # several matches -> their texts concatenated.
    return "".join(element.text_content().strip() for element in matches)
Beispiel #4
0
def extract_title(fragment):
    """Return the text of the first ``<h1>`` in an HTML fragment.

    Returns ``""`` when lxml is unavailable (``has_lxml`` is falsy) or when
    the heading cannot be extracted; in the latter case the traceback is
    printed and the error is otherwise swallowed (best-effort behavior).
    Errors raised while parsing ``fragment`` itself still propagate.
    """
    if not has_lxml:
        return ""
    doc = document_fromstring(fragment)
    try:
        return cs('h1')(doc)[0].text_content()
    except Exception:
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt and SystemExit are not swallowed.
        import traceback
        traceback.print_exc()
        return ""
Beispiel #5
0
def summarize(content, url=""):
    """Return a summary for an html document.

    If a URL is passed, it may be treated specially to give better results:
    for a twitter.com URL containing "status" the tweet text is returned,
    prefixed with ``@<screen name>: `` when the screen name is available.

    Otherwise the first non-empty value among the OpenGraph description,
    the OpenGraph title (both passed through ``utils.maxlen``) and the text
    of ``<title>`` is returned.
    """
    doc = document_fromstring(content)
    if url:
        parsed = urlparse.urlparse(url)
        if parsed.netloc.endswith("twitter.com") and "status" in url:
            tweet = text(".permalink-tweet .tweet-text", doc)
            try:
                username = cs(".permalink-tweet")(doc)[0].attrib["data-screen-name"]
            except (IndexError, KeyError):
                # No permalink tweet element, or it has no screen name.
                return tweet
            return "@%s: %s" % (username, tweet)

    # try to return opengraph description or title first, then just the <title>
    for selector in ("meta[property=\"og:description\"]",
                     "meta[property=\"og:title\"]"):
        meta = first(selector, doc)
        # An lxml element's truth value reflects whether it has children,
        # so a childless <meta> is falsy; compare against None explicitly.
        if meta is None:
            continue
        value = meta.get("content")
        # Skip tags with a missing or empty content attribute instead of
        # raising KeyError or returning an empty summary.
        if value:
            return utils.maxlen(value)

    return text("title", doc)
def extract_reply_cids(html):
    """Return the ``data-cid`` attribute of every "load comments" element.

    The elements are those matching
    ``.comment-replies-header > .load-comments`` in the parsed ``html``;
    elements without the attribute contribute ``None``.
    """
    load_links = cs('.comment-replies-header > .load-comments')
    document = lxml.html.fromstring(html)
    cids = []
    for link in load_links(document):
        cids.append(link.get('data-cid'))
    return cids
Beispiel #7
0
from django.core.paginator import Paginator, InvalidPage, EmptyPage
from django.db.models import Q
from django.http import Http404, HttpResponseRedirect, HttpResponse
from django.shortcuts import render_to_response
from django.template import RequestContext
from laws import models 
from laws.models import SectionFile, SearchForm
from utils.searchtext import searchtext_sphinx, searchtext_FTS4
from utils.utils import *
from operator import itemgetter
from lxml import html, etree
from lxml.cssselect import CSSSelector as cs
import settings
            
# Precompiled CSS selector that matches <body> elements.
bodysel = cs('body')

def target_remove(request):
        """Redirect to the current URL with every 'target/' substring removed."""
        redirect_to = request.get_full_path().replace('target/', '')
        return HttpResponseRedirect(redirect_to)

@render_to("code_display.html")
def target_to_section(request, codename, target_section):
        if codename == 'this':
            current_url = request.META['HTTP_REFERER']            
            codename = current_url.split('-')[1]
            #For Table of Contents, there is a trailing / that needs to be removed
            codename = codename.strip('/')
            print request.get_full_path()
            #Hack to ensure there is one, and only one './' at the end of the url
Beispiel #8
0
Datei: main.py Projekt: aih/calaw
# Python
# run popCode() then main()

import os, sys, re, subprocess, fnmatch
import pickle
from subprocess import PIPE, Popen
from lxml import html, etree
from lxml.cssselect import CSSSelector as cs
from laws import models

# Settings
regfile = "./utils/sectionlist.txt"

# Folder that holds the legislation files
pathin = "./media/cacodegit/"
# Precompiled CSS selector that matches every <div> element.
divisions = cs("div")


def unix_find(pathin):
    """Yield the path of every non-hidden file below ``pathin``.

    Walks the directory tree like a bare Unix ``find`` would, but yields
    files only (no directories) and skips file names starting with a dot.
    Dot-directories themselves are still descended into.
    """
    for dirpath, _subdirs, filenames in os.walk(pathin):
        for name in filenames:
            if fnmatch.fnmatch(name, ".*"):
                # Hidden file: leave it out.
                continue
            yield os.path.join(dirpath, name)


# Populate the Code table
codedictpath = "./utils/codedict"
# Load the pickled object inside a ``with`` block so the file handle is
# closed right away instead of being left open until garbage collection.
# NOTE(review): unpickling can execute arbitrary code; keep this file trusted.
with open(codedictpath, "rb") as _codedict_file:
    codedict = pickle.load(_codedict_file)
Beispiel #9
0
codedictpath = '/Users/tabulaw/Documents/workspace/calaw/codedict'
# Load the pickled mapping (code abbreviation -> full name, as used by
# saveCode) inside a ``with`` block so the file handle is closed right away.
# NOTE(review): unpickling can execute arbitrary code; keep this file trusted.
with open(codedictpath, 'rb') as _codedict_file:
    codedict = pickle.load(_codedict_file)
def popCode():
    """Call ``saveCode`` once for every key of ``codedict``."""
    for abbreviation in codedict:
        saveCode(abbreviation)

def saveCode(codeabbr):
    """Create and save a ``models.Code`` row for one code abbreviation.

    The full name is looked up in ``codedict`` and the URL is built as
    ``/laws/target/<codeabbr>/``.
    """
    full_name = codedict[codeabbr]
    target_url = '/laws/target/' + codeabbr + '/'
    record = models.Code(name=codeabbr, fullname=full_name, url=target_url)
    record.save()

# Parse sections and save each to the db
# Precompiled CSS selector matching every <div>; used by getSectionsHTML.
divsel = cs('div')
def getSectionsHTML(inputfiletext):
    """Return ``(id, serialized markup)`` tuples for every <div> in a page.

    ``inputfiletext`` is the full HTML text of a page.  Each <div> matched
    by ``divsel`` contributes its ``id`` attribute (``None`` if absent) and
    its markup as produced by ``etree.tostring``.
    """
    document = html.document_fromstring(inputfiletext)
    sections_html = []
    for div in divsel(document):
        sections_html.append((div.get("id"), etree.tostring(div)))
    return sections_html
    
def saveSectionFile(codeinput, filename, inputfiletext):
    code_instance = models.Code.objects.get(name = codeinput)
    sectionfile_current = models.SectionFile(
Beispiel #10
0
def first(selector, html):
    """Return the first element under ``html`` matching ``selector``.

    Returns ``None`` when the selector matches nothing.
    """
    matches = cs(selector)(html)
    return matches[0] if matches else None