Example #1
def define(word):
    url = 'http://www.google.com/dictionary/json?callback=a&sl=en&tl=en&q=' + word + '&restrict=pr%2Cde&client=te'
    headers = {'User-Agent' : 'Mozilla 5.10'}
    request = urllib2.Request(url, None, headers)
    response = urllib2.urlopen(request)
    jsonstring = '[' + response.read()[2:-1] + ']'
    # Replace hex escape sequences (\xNN) with their ASCII characters
    hex_pattern = re.compile(r'\\x(\w{2})')
    ascii_string = hex_pattern.sub(asciirepl, jsonstring)
    data = json.loads(ascii_string)
    try:
        defs = data[0]['primaries'][0]['entries']
    except KeyError:
        try:
            defs = data[0]['webDefinitions'][0]['entries']
        except KeyError:
            raise ValueError('no definition')
    # Get the first definition of the word
    definition = ''
    for entry in defs:
        if entry['type'] == 'meaning':
            # Clean the definition of any html
            html = lxml.html.fragment_fromstring('<p>' + entry['terms'][0]['text'] + '</p>')
            definition = html.text_content()
            break
    return definition.encode('utf8')
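A minimal, self-contained sketch of the markup-stripping step this example relies on; the fragment string below is invented for illustration:

import lxml.html

# Hypothetical snippet of dictionary markup, wrapped in <p> exactly as the example does
fragment = lxml.html.fragment_fromstring('<p>a <em>small</em> domesticated <b>carnivore</b></p>')
print(fragment.text_content())  # -> 'a small domesticated carnivore'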
Example #2
    def links_grab(self, url):
        if self.incorrect_url_ending(url): return []
        if self.basic_auth_required:
            r = requests.get(url,
                             auth=HTTPBasicAuth(self.username, self.password))
        else:
            r = requests.get(url)
        html = lxml.html.fromstring(unidecode(r.text))
        content = str(unidecode(html.text_content()))
        content = "".join(
            [elem for elem in content if not elem in ["\t", "\n", "\r"]])
        content = " ".join([elem for elem in content.split(" ") if elem != ''])
        #if self.save_to_database: self.db.save(url,content) #saves to the database, database.db
        datum = {}
        try:
            datum["title"] = html.xpath("//title")[0].text_content()
        except:
            print r.url

        if r.url.startswith("/"):
            datum["url"] = self.domain_name + r.url
        else:
            datum["url"] = r.url
        self.data.append(datum)
        url_list = html.xpath("//a/@href")
        uri_list = []
        for uri in url_list:
            if uri.startswith("/"):
                uri_list.append(self.domain_name + uri)
            else:
                uri_list.append(uri)
        return uri_list + [url]  #ensures the url is stored in the final list
Example #3
def parse(url):
	data = urllib2.urlopen(url)
	d = feedparser.parse(data.read())
	
	feed = {
		'title': d.feed.title,
		'subtitle': d.feed.subtitle,
		'link': d.feed.link,
		'articles': []
	}
	
	for entry in d.entries:
		html = lxml.html.fromstring(entry.summary)
		img = html.xpath("//img")
		text = html.text_content().strip()
		if img is not None and len(img) > 0:
			img = img[0].attrib['src']
		else:
			img = ''
	
		article = {
			'title': entry.title,
			'content': text,
			'author': entry.author if 'author' in entry else '',
			'link': entry.link,
			'photo': img,
			'date': datetime.datetime.fromtimestamp(time.mktime(entry.updated_parsed))
		}
		feed['articles'].append(article)
	return feed
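A hedged usage sketch for the parse() function above; the feed URL is a placeholder, and the function's own imports (urllib2, feedparser, lxml.html, datetime, time) are assumed to be in place:

# Illustrative only -- substitute a real RSS/Atom feed URL.
feed = parse('http://example.com/rss.xml')
print(feed['title'])
for article in feed['articles']:
    print('%s | %s | image: %s' % (article['date'], article['title'], article['photo'] or 'none'))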
Example #4
    def _get_uks_link_mp3_cambridge(self, org_word, word, item):
        BASE_URL = 'http://dictionary.cambridge.org/dictionary/english/'
        url = BASE_URL + word
        print(url)
        html = requests.get(url).content                                          
        tree = lxml.html.fromstring(html)
        uks = tree.xpath("//span[@class='sound audio_play_button pron-icon uk']/@data-src-mp3")

        #pos_header = tree.xpath("//div[@class='pos-header']")[0]
        # //*[@id="dataset-british"]/div[1]/div[2]/div/div/div[1]/span[2]
        # //*[@id="dataset-british"]/div[1]/div[2]/div/div/div[1]/span[2]/span
        # //*[@id="dataset-british"]/div[1]/div[2]/div/div/div[1]/span[@class='uk']/span[@class='pron']/span[@class='ipa']/text()
        # uks_pron = tree.xpath("//span[@class='uk']/span[@class='pron']/span[@class='ipa']/text()")
        uks_pron_html = tree.xpath("//*[@id='dataset-british']/div[1]/div[2]/div/div/div[1]/span[@class='uk']/span[@class='pron']/span[@class='ipa']")
        sqlVocab = SqliteVocabulary("studyenglish.db", "vocabulary")
        #import xml.etree.ElementTree as ET
        uks_pron = [html.text_content() for html in uks_pron_html]
        prons = u'/' + u'/,/'.join(uks_pron) + u'/'
        #if uks_pron:
        #    prons = u'/' + uks_pron[0] + u'/'
        self.tree.set(item,'#2',prons)
        if len(uks_pron)>0:
            sqlVocab.update_uk_pron(org_word, prons)

        return uks
Example #5
 def parse_full_10K(cls, doc):
     text = ""
     for child in doc.getchildren():
         if child.tag == 'sec-header':
             continue
         html, properties = TXTML.get_HTML_from_document(child)
         if re.search('10[-]*[KQ]', properties['type']):
             text = text + html.text_content()
     return text
Example #6
def getTodos(projects, objects):
  """
    Get todos for each project
  """
  tags_dict = getTags(objects)
  for project in projects:
    for ref_id in project['ref_ids'].split():
      for object in objects:
        if object.attributes['id'].value == ref_id:
          attribute_nodes = object.getElementsByTagName("attribute")
          title        = ""
          content      = ""
          datemodified = ""
          datecreated  = ""
          datecompleted= ""
          tags         = ""           
          for attribute_node in attribute_nodes:
            if attribute_node.attributes['name'].value == 'title':
              if attribute_node.childNodes:
                  title = attribute_node.childNodes[0].nodeValue.encode("utf-8")
                  break
          # Check if todo has a note attached
          if title:
            for attribute_node in attribute_nodes:
              # <attribute name="datemodified" >309306984.40529602766036987305
              if attribute_node.attributes['name'].value == 'datemodified':
                datemodified = convertCocoaEpoch(attribute_node.childNodes[0].\
                    nodeValue.encode("utf-8"))
              # <attribute name="datecreated" >306520491.00000000000000000000
              if attribute_node.attributes['name'].value == 'datecreated':
                datecreated = convertCocoaEpoch(attribute_node.childNodes[0].\
                    nodeValue.encode("utf-8"))
              #<attribute name="datecompleted" type="date">292880221.18648099899291992188
              if attribute_node.attributes['name'].value == 'datecompleted':
                datecompleted = convertCocoaEpoch(attribute_node.childNodes[0].\
                    nodeValue.encode("utf-8"))
              if attribute_node.attributes['name'].value == 'content':
                content = attribute_node.childNodes[0].nodeValue #.encode("utf-8")
                # let's encode in writeOutline
                # I think we need to translate all these things
                html = content.replace('\\u3c00', '<').replace('\\u3e00', '>')
                html = html.replace('\\u2600', '&')
                html = lxml.html.fromstring(html)
                content = html.text_content().split('\n')
                for l in html.iterlinks():
                    content += [l[2]]
            relationship_nodes = object.getElementsByTagName("relationship")
            for relationship_node in relationship_nodes:
              if relationship_node.attributes['name'].value == 'tags':
                try:
                  tags_id = relationship_node.attributes['idrefs'].value
                  tags = [tags_dict[t_id] for t_id in tags_id.split()]
                except:
                  tags = ""

          project['todos'].append([title, content, datecreated, datemodified, datecompleted, tags])
  return projects
Example #7
def index():
    if request.method == 'GET':
        return render_template('index.html')
    if request.method == 'POST':
        url = request.form.get('url')
        data = urllib.request.urlopen(url).read()
        #soup = BeautifulSoup(data,"lxml")
        html = lxml.html.fromstring(data)
        #text=soup.get_text()
        text = html.text_content()
        return render_template('output.html', result=analyze_text(text))
Example #8
 def links_grab(self,url):
     if self.incorrect_url_ending(url): return [] 
     if self.basic_auth_required:
         r = requests.get(url,auth=HTTPBasicAuth(self.username, self.password))
     else:
         r = requests.get(url)
     html = lxml.html.fromstring(unidecode(r.text))
     #checks if VEC urls are in page
     #remove this after VEC goes live
     VEC = [
         "/veterans-employment-center/",
         "https://www.vets.gov/veterans-employment-center/",
         "https://www.vets.gov/employment/",
         "/employment/"
     ]
     if any([uri in html.xpath("//a/@href") for uri in VEC]): 
         print "found"
         return []
     
     content = str(unidecode(html.text_content()))
     if "Coming Soon." in content:
         print "found"
         return []
     content = "".join([elem for elem in content if not elem in ["\t","\n","\r"]])
     content = " ".join([elem for elem in content.split(" ") if elem != ''])
     if self.save_to_database: self.db.save(url,content) #saves to the database, database.db
     datum = {}
     try:
         datum["title"] = html.xpath("//title")[0].text_content()
     except:
         print r.url
     datum["path"] = r.url
     datum["created"] = self.created
     datum["content"] = content
     
     if self.testing:
         datum["description"] = 'none'
     else:
         try:
             datum["description"] = str(self.df.Description[self.df.Page_Address == r.url].tolist()[0]) 
         except:
             datum["description"] = "none"
     datum["promote"] = "false"
     datum["language"] = "en"
     #remove after VEC goes live
     if not "veterans-employment-center" in url: self.data.append(datum)
     url_list = html.xpath("//a/@href") 
     uri_list = []
     for uri in url_list:
         if uri.startswith("/"):
             uri_list.append(self.domain_name+uri)
         else:
             uri_list.append(uri)
     return uri_list + [url] #ensures the url is stored in the final list
Example #9
def get_info(file):
    with open(os.getcwd()+"/WEBPAGES_RAW/"+file,encoding = 'utf-8') as file:
        textcont = file.read()
        html = lxml.html.fromstring(textcont)#gets the title of the page and a little bit of the content
    title = ""
    for tag in html.xpath("//title"):
        title += tag.text_content()
        title += ""
#    body = html.xpath("//body").text_content()
    intro = " ".join([w for w in re.split(r"[^a-zA-Z0-9]",html.text_content()[:400]) if len(w) > 0])
    return (title, intro.lower())
Example #10
 def links_grab(self,url):
     if self.incorrect_url_ending(url): return [] 
     if self.basic_auth_required:
         r = requests.get(url,auth=HTTPBasicAuth(self.username, self.password))
     else:
         r = requests.get(url)
     html = lxml.html.fromstring(unidecode(r.text))
     #checks if VEC urls are in page
     #remove this after VEC goes live
     VEC = [
         "/veterans-employment-center/",
         "/employment/"
     ]
     if any([uri in html.xpath("//a/@href") for uri in VEC]): 
         print "found"
         return []
     
     content = str(unidecode(html.text_content()))
     content = "".join([elem for elem in content if not elem in ["\t","\n","\r"]])
     content = " ".join([elem for elem in content.split(" ") if elem != ''])
     if self.save_to_database: self.db.save(url,content) #saves to the database, database.db
     datum = {}
     try:
         datum["title"] = html.xpath("//title")[0].text_content()
     except:
         print r.url
     datum["path"] = r.url
     datum["created"] = self.created
     datum["content"] = content
     
     if self.testing:
         datum["description"] = 'none'
     else:
         try:
             datum["description"] = str(self.df.Description[self.df.Page_Address == r.url].tolist()[0]) 
         except:
             datum["description"] = "none"
     datum["promote"] = "false"
     datum["language"] = "en"
     #remove after VEC goes live
     if not "veterans-employment-center" in url: self.data.append(datum)
     url_list = html.xpath("//a/@href") 
     uri_list = []
     for uri in url_list:
         if uri.startswith("/"):
             uri_list.append(self.domain_name+uri)
         else:
             uri_list.append(uri)
     return uri_list + [url] #ensures the url is stored in the final list
Example #11
 def links_grab(self,url):
     r = requests.get(url)
     domain_name = self.get_domain_name(url)
     html = lxml.html.fromstring(unidecode(r.text))
     content = str(unidecode(html.text_content()))
     if self.save_to_database: self.db.save(url,content) #saves to the database, database.db
     self.data.append(content)
     url_list = html.xpath("//a/@href") 
     uri_list = []
     for uri in url_list:
         if uri.startswith("/"):
             uri_list.append(domain_name+uri)
         else:
             uri_list.append(uri)
     return uri_list + [url] #ensures the url is stored in the final list
Example #12
 def links_grab(self, url):
     r = requests.get(url)
     domain_name = self.get_domain_name(url)
     html = lxml.html.fromstring(unidecode(r.text))
     content = str(unidecode(html.text_content()))
     if self.save_to_database:
         self.db.save(url, content)  #saves to the database, database.db
     self.data.append(content)
     url_list = html.xpath("//a/@href")
     uri_list = []
     for uri in url_list:
         if uri.startswith("/"):
             uri_list.append(domain_name + uri)
         else:
             uri_list.append(uri)
     return uri_list + [url]  #ensures the url is stored in the final list
Example #13
def scrape():
    data_items = []
    rss_xml = lxml.html.fromstring(scraperwiki.scrape(rss_url))
    for entry_el in rss_xml.findall('entry'):
        alert_id = entry_el.findtext('id').partition('.post-')[2]
        location_name = entry_el.findtext('title').replace('Rower Notification - ', '')
        content_html = saxutils.unescape(entry_el.findtext('content'))
        content_html = content_html.replace('<BR>', '\n').replace('<br />', '\n')
        content_html = content_html.partition('___')[0].partition('Regards')[0]
        html = lxml.html.fromstring(content_html)
        note_text = html.text_content().strip()
        (subject, sep, detail) = note_text.partition('\n')
        location_desc = subject.replace('Rower notification from Thames Water:', '').strip() # Remove prefix
        date_time = entry_el.findtext('published').partition('.')[0].replace('T', ' ')
        alert_detail = detail.strip()
        data_items.append({ 'alert_id': alert_id, 'loc_name': location_name, 'loc_desc': location_desc, 'published': date_time, 'detail': alert_detail })

    # Save the data
    scraperwiki.sqlite.save(unique_keys=['alert_id'], data=data_items, table_name='alerts', verbose=data_verbose)
Example #14
def get_clean_html(etree, text_only=False):
    _is_etree(etree)
    # enable filters to remove Javascript and CSS from HTML document
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.html = True
    cleaner.page_structure = False
    cleaner.meta = False
    cleaner.safe_attrs_only = False
    cleaner.links = False

    html = cleaner.clean_html(etree)
    if text_only:
        return html.text_content()

    return lxml.html.tostring(html)
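A sketch of how the helper above might be called, assuming Cleaner comes from lxml.html.clean and _is_etree merely validates its argument:

import lxml.html

doc = lxml.html.fromstring('<html><body><script>alert(1)</script><p>Hello</p></body></html>')
print(get_clean_html(doc, text_only=True))  # scripts and styles stripped, plain text returned
print(get_clean_html(doc))                  # cleaned markup serialised via lxml.html.tostring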
Example #15
def get_clean_html(etree, text_only=False):
    _is_etree(etree)
    # enable filters to remove Javascript and CSS from HTML document
    cleaner = Cleaner()
    cleaner.javascript = True
    cleaner.style = True
    cleaner.html = True
    cleaner.page_structure = False
    cleaner.meta = False
    cleaner.safe_attrs_only = False
    cleaner.links = False
    
    html = cleaner.clean_html(etree)
    if text_only:
        return html.text_content()

    return lxml.html.tostring(html)
Example #16
def get_string(html_text):
    html = lxml.html.fromstring(html_text)
    remove_tags = ('.//style', './/script', './/noscript')
    for remove_tag in remove_tags:
        for tag in html.findall(remove_tag):
            tag.drop_tree()
            # The removal here is reflected back in the original variable t.

    codeframe_list = []
    lang_list = []
    # Remove code blocks
    for tag in html.findall(".//div[@class='code-frame']"):
        codeframe_list.append(tag.text_content())
        lang_list.append(tag.attrib["data-lang"])
        tag.drop_tree()

    atext_list = []
    ahref_list = []
    # Remove href links
    for tag in html.cssselect('a'):
        if tag.text is not None:
            atext_list.append(tag.text)
        if tag.get('href') is not None:
            ahref_list.append(tag.get('href'))
        tag.drop_tree()

    code_list = []
    # Remove one-line code snippets
    for cc in html.cssselect('code'):
        if cc.text is not None:
            code_list.append(cc.text)
        cc.drop_tree()

    text = html.text_content().strip('\n')

    return pd.Series(
        [
            "".join(text.split('\n')), ",".join(codeframe_list),
            ",".join(lang_list), ",".join(code_list), ",".join(atext_list),
            ",".join(ahref_list)
        ],
        index=['text', 'code-frame', 'lang', 'code', 'a-text', 'a-href'])
Example #17
def getParcel(streetnum, street):
    streetname = street.rsplit(" ",1)[0]
    pageURL = "http://www.padctnwebpro.com/WebproNashville/searchResults.asp?cboSearchType=Street&SearchVal1=" + urllib.quote(streetname) + "&SearchVal2=" + urllib.quote(streetnum)
    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    html = lxml.html.parse(opener.open(pageURL)).getroot()
    rows = html.cssselect('#T1 tbody tr')
    parcelID = ""
    if len(rows) > 1:
        i = 0
        while i <= len(rows)-1:
            parcelID += list(rows[i])[0].text_content().replace(" ", "-")
            if i < len(rows)-1:
                parcelID += "|"
            i += 1
    else:
        if len(rows) == 1:
            parcelID = list(rows[0])[0].text_content().replace(" ", "-")
        else:
            print "pageURL: " + pageURL
            print "html: " + html.text_content()
    print "parcelID = " + parcelID
    return parcelID
Example #19
def get_clean_text(file):
    
    #strings of important html tags
    title = ""
    h1 = ""
    h2 = ""
    h3 = ""
    bold = ""
    
    try:
        with open(file, 'r', encoding="utf-8") as f:
            textcont = f.read()
            html = lxml.html.fromstring(textcont)
            
    except:
        return ("", "", "", "", "", "")
    words = html.text_content()
    
    #get the content in important tags
    for tag in html.xpath("//title"):
        title += tag.text_content()
        title += ""
    for tag in html.xpath("//h1"):
        h1 += tag.text_content()
        h1 += ""
    for tag in html.xpath("//h2"):
        h2 += tag.text_content()
        h2 += ""
    for tag in html.xpath("//h3"):
        h3 += tag.text_content()
        h3 += ""
    for tag in html.xpath("//bold"):
        bold += tag.text_content()
        bold += ""
     
    #return a tuple
    return (words.lower(), title.lower(), h1.lower(), h2.lower(), h3.lower(), bold.lower())
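A hedged usage sketch for the file-based helper above; the path is hypothetical:

# Hypothetical path -- any HTML file readable as UTF-8 will do.
words, title, h1, h2, h3, bold = get_clean_text('pages/sample.html')
print(title)
print(words[:200])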
Example #20
list_dir = sorted(list_dir)

id = 0
files = sorted(os.listdir("developer/DEV/" + list_dir[0]))
f = open("idToIndex.txt", "w")
for each_sub in list_dir:
    for each_json in sorted(os.listdir("developer/DEV/" + each_sub)):
        with open("developer/DEV/" + each_sub + "/" + each_json,
                  encoding="utf-8") as read_file:
            stemProcesser = PorterStemmer()
            data = json.load(read_file)
            data_content = data["content"]
            data_content = data_content.encode("utf-8", "ignore")
            try:
                html = lxml.html.fromstring(data_content)
                parse_html_content = html.text_content()
                token_list = []

                parse_html_content = parse_html_content.translate(table)

                parse_html_split = parse_html_content.rsplit()

                new_list = [
                    token for token in parse_html_split if token.isascii()
                ]  #list of token

                after_stem = []

                for word in new_list:  #stem processing
                    after_stem.append(stemProcesser.stem(word))
                    #print(after_stem)
Example #21
# scrape the San Luis Obispo Police Department Summary Report,
# parse the responses, then add them to the CSV.

from lxml import html
import requests
import csv


def oreo(s, start, end):
    return ((s.split(start))[1].split(end)[0]).strip()


page = requests.get('http://pdreport.slocity.org/policelog/rpcdsum.txt')
html = html.fromstring(page.content)
text = html.text_content()
cut_footer = text.split(
    '--------------------------------------------------------------------------------'
)[0]
els = cut_footer.split(
    '==============================================================================='
)

header = els.pop(0)
new_log_day = oreo(header, '\n', 'Summary Report').strip()

print 'DOWNLOADING SLO POLICE DATA...'

i = 0
items = []

for el in els:
Example #22
def _findTextInNode(bs_node_or_text):
    if isinstance(bs_node_or_text, basestring):
        return bs_node_or_text
    else:
        html = lxml.html.fromstring(unicode(bs_node_or_text))
        return html.text_content()
Example #23
# scrape the San Luis Obispo Police Department Summary Report,
# parse the responses, then add them to the CSV.

from lxml import html
import requests
import csv

def oreo(s, start, end):
  return ((s.split(start))[1].split(end)[0]).strip()

page = requests.get('http://pdreport.slocity.org/policelog/rpcdsum.txt')
html = html.fromstring(page.content)
text = html.text_content()
cut_footer = text.split('--------------------------------------------------------------------------------')[0]
els = cut_footer.split('===============================================================================')

header = els.pop(0)
new_log_day = oreo(header, '\n', 'Summary Report').strip()

print 'DOWNLOADING SLO POLICE DATA...'

i = 0
items = []

for el in els:
    el = el.strip()

    if ( i % 2 == 0 ):
        time_data = el.split(' ')
        item = {}
        item['date'] = time_data[1]
Example #24
def get_text_from_html(html_str):
    html = lxml.html.fromstring(html_str)
    contents = html.text_content().strip()
    return contents
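A quick illustrative call of get_text_from_html() above:

print(get_text_from_html('<div><h1>Title</h1><p>Body &amp; more</p></div>'))
# -> 'TitleBody & more' (tags dropped, entities decoded, surrounding whitespace stripped)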
Example #25
def get_clean_text(html):
    """
    Removes extra blank spaces and nbsp from html text.
    """
    return " ".join(html.text_content().split())
Example #26
def _get_descr(event):
    event = _fetch(_URL, {'sch.tv_event_id': event['id']})['tv_event']
    html = lxml.html.fragment_fromstring(event['descr'], create_parent=True)
    return html.text_content()
Example #27
def get_canonical_dom(text_id, html):
    """
	Gets a list of first level dom elements part of the law project (titre, chapitre, articles)
	by removing unwanted parts (exposé des motifs, décret)
	(laws are usually made of few div containing lots of p elements)
	:param text_id:
	:param dom_elements:
	:return:
	"""

    if 'Projet de loi de finances' in html.text_content()[0:1000]:
        # Projet de loi finance (ex: PRJLANR5L15B2272)
        return None

    # law parts are in div with class assnatSection but not all of them are meaningful
    dom_elements = html.xpath(
        "/html/body/div[starts-with(@class,'assnatSection')]")

    if dom_elements:
        dom_element = dom_elements[-1]
        if search(r'Annexe \w', dom_element.getchildren()[1].text_content()):
            # If there is one or more annexe (with title formatted as Annexe X), each one is in
            # a different div. Thus, more than one div should be aggregated to make the final draft
            # ex: PRJLANR5L15B2296, PRJLANR5L15B2416
            # Also, this should be in addition to an empty div (next case).
            # This part of the method should probably be recursive...
            children = []
            pass
        elif 'PRJ' in text_id and len(dom_element.text_content()) < 10:
            # if the div content is less than 10 chars, fall back to the previous block
            children = dom_elements[-2]
        elif 'PRJ' in text_id and 'RAPPORT ANNEXÉ' in dom_element.text_content(
        ):
            # if there is one "rapport annexé", taking both the previous div and this one
            children = dom_elements[-2].getchildren()
            children.extend(dom_elements[-1].getchildren())
        else:
            # otherwise, take all tags of the last div
            children = dom_element.getchildren()

        if not len(children):
            return None

        count_tags = reduce(count_by_element_type, children, {})
        # max_tag = (None, 0)
        # for count_tag in count_tags.items():
        # 	if count_tag[1] > max_tag[1]:
        # 		max_tag = count_tag
        max_tag = reduce(lambda acc, x: x if x[1] > acc[1] else acc,
                         count_tags.items(), (None, 0))
        if max_tag[0] == 'div':
            # some laws are made of divs containing lots of divs (not the same format) => PRJLANR5L15BTC3995
            return None

        # print(count_tags)
        count_class = reduce(
            lambda acc, x: (acc + 1) if x.get('class') else acc, children, 0)
        # print(count_class)
        if count_class < 0.1 * len(children):
            # less than 10% of tags have a class, considered unparsable (projet de loi finance / budget)
            return None
        text_content = dom_element.text_content()
        res = []
        if 'PION' in text_id:
            # print('EXPOSÉ DES MOTIFS') if 'EXPOSÉ DES MOTIFS' in text_content else print('...')
            # print('PROPOSITION DE LOI') if 'proposition de loi' in text_content else print('...')
            if 'EXPOSÉ DES MOTIFS' in text_content and 'PROPOSITION DE LOI' in text_content:
                # in some law text, there is a first section, not meaningful that needs to be removed
                for index in range(len(children)):
                    if 'PROPOSITION DE LOI' in children[index].text_content():
                        res = children[index + 1:]
                        break
                else:
                    # FIXME weird case
                    res = []
            else:
                res = children
        elif 'PRJ' in text_id:
            if 'table' not in count_tags:
                # if there is a table, it might be for: a signed pre-text or a table inside the law
                # but if there isn't, all the tags can be taken
                res = children
            else:
                for index in range(len(children)):
                    if 'Articleliminaire' in compile(r'\s+').sub(
                            '', children[index].text_content()):
                        res = children[index:]
                        break
                    if 'Article unique' in children[index].text_content():
                        res = children[index:]
                        break
                    if 'Article 1' in children[index].text_content():
                        # if article 1 is found, going backwards to search for chapter and part titles
                        for jndex in range(index, 0, -1):
                            if children[jndex].get(
                                    'class') and 'assnatLoiTexte' in children[
                                        jndex].get('class'):
                                # print(jndex)
                                res = children[jndex + 1:]
                                break
                        else:
                            # if not found, we assume there is nothing before,
                            # hence all children are taken
                            res = children
                        break
        return res
Example #28
	def crawl(self, url):
		try:
			global num_of_visited_links, pq, total_downloaded_size
			
			#Checking if the specified page limit is reached or not
			if len(pq)>total_page_limit:
				return
			
			try:
				#Extracting home link of current page to get URL/robots.txt
				rpurl = url
				if '/' in url[url.index('.'):]:
					rpurl = url[:url.index('/',url.index('.'))]
				rpurl = rpurl + '/robots.txt'
				#Checking if current url can be visited
				rp = robotparser.RobotFileParser()
				rp.set_url(rpurl)
				rp.read()
				if not rp.can_fetch("*", url):
					return
			except Exception as e:
				if debug_mode:
					print 'Error in Robot Parser : '+str(e)
				pass
			
			#Opening the url and reading its content	
			page = urllib.urlopen(url)
			content = page.read()
			#content = content[content.index("<body"):content.index("</body")]
			
			#Using lxml library to extract html from the content
			html = lxml.html.fromstring(content)
			#Modifying all relative urls present in html file and making
			#them absolute using the specified url passed as parameter
			html.make_links_absolute(url)
			
			#Incrementing the total downloaded size of all pages
			total_downloaded_size = total_downloaded_size + sys.getsizeof(content)
			
			#Incrementing no. of visited links for each page visited
			num_of_visited_links = num_of_visited_links + 1
			
			#Printing out <Links visited : __ | Relevant Links : __ >
			stdout.write("\rLinks Visited : %i | Relevant Links : %i" % (num_of_visited_links, len(pq)))
			stdout.flush()
			
			#Extracting text content
			text_content = html.text_content()

			#Calculating word count for given query
			word_count = 0
			query_words = self.query.split()
			for w in query_words:
				word_count += text_content.count(w)
				
			#If word count is less than 1; page irrelevant!	
			if word_count < 1 :
				return

			#Checking if the present url is an anchor jump url
			#If yes, then extract the parent link from anchor jump url
			#and use that as current url if not already present in the 
			#priority queue
			anchor_jump_link = ''
			if '#' in url:
				anchor_jump_link = url[:url.index('#')]
				#if self.pagevisited(anchor_jump_link):
				#	return
				#else:
				url = anchor_jump_link

			#Check to see if the page already exists or not
			#If yes, then increase the page priority by one and return
			#If not, then acquire lock and write url details onto links.txt
			#and push [priority, url] into pq (priority queue)
			existing_page = self.pagevisited(url)			
			if not existing_page:
				self.lock.acquire()
				self.file.write('Link : '+url+' | Word Count(Priority) : '+str(word_count)+'\n')
				self.lock.release()
				heappush(pq, [-word_count, url])
			else:
				existing_page[0] = existing_page[0] + (-1)
				return
				
			#Extracting all anchor tag elements	
			urls = html.xpath('//a')
			#Iterating through all anchor tags that were extracted
			for u in urls:
				#Extracting the href attribute of each anchor tag to get a url
				link = u.get('href')
				#If extracted url not already visited, converting it to 
				#lower case and passing it to crawl method recursively
				if not self.pagevisited(link):
					if issubclass(type(link), unicode) or issubclass(type(link), str):
						self.crawl(link.lower())
						
		except Exception as e:
			if debug_mode:
				print 'Error in parser : url<'+url+'> : ' + str(e)
			pass
Example #29
	def run(self):
		try:
			global num_of_visited_links, pq, total_downloaded_size
			
			#Checking if the specified page limit is reached or not
			if len(pq)>total_page_limit:
				return
			#Checking if the type of url is correct
			if not issubclass(type(self.url), unicode) and not issubclass(type(self.url), str):
				return
			
			#Opening the url and reading its content	
			page = urllib.urlopen(self.url)
			content = page.read()
			#content = content[content.index("<body"):content.index("</body")]
			
			#Using lxml library to extract html from the content
			html = lxml.html.fromstring(content)
			#Modifying all relative urls present in html file and making
			#them absolute using the specified url passed as parameter
			html.make_links_absolute(self.url)
			
			#Incrementing the total downloaded size of all pages
			total_downloaded_size = total_downloaded_size + sys.getsizeof(content)
			
			#Incrementing no. of visited links for each page visited
			num_of_visited_links = num_of_visited_links + 1
			
			#Printing out <Links visited : __ | Relevant Links : __ >
			#stdout.write("\rLinks Visited : %i | Relevant Links : %i" % (num_of_visited_links, len(pq)))
			#stdout.flush()
			
			#Extracting text content
			text_content = html.text_content()

			#Calculating word count for given query
			word_count = 0
			query_words = self.query.split()
			for w in query_words:
				word_count += text_content.count(w)
				
			#If word count is less than 1; page irrelevant!
			if word_count < 1 :
				return
			
			#Extracting all anchor tag elements
			urls = html.xpath('//a')
			#Iterating through all anchor tags that were extracted
			for u in urls:
				#Extracting the href attribute of each anchor tag to get a url
				link = u.get('href')
				#If extracted url not already visited, converting it to 
				#lower case and passing it to crawl method
				if not self.pagevisited(link):
					if issubclass(type(link), unicode) or issubclass(type(link), str):
						self.crawl(link.lower())
								
		except Exception as e:
			if debug_mode:
				print "Exception in Run : "+ str(e)
			pass
Example #30
def _findTextInNode(bs_node_or_text):
    if isinstance(bs_node_or_text, basestring):
        return bs_node_or_text
    else:
        html = lxml.html.fromstring(unicode(bs_node_or_text))
        return html.text_content()
Example #31
def test_character(response, excerpt):
    html = lxml.html.fromstring(response.content)
    assert excerpt in subparsers.character(html.text_content())
Example #32
def test_linear_feet(response, result):
    html = lxml.html.fromstring(response.content)
    assert subparsers.linear_feet(html.text_content()) == result