Example #1
def scrapeStep(self, root):
    """Fetch one page, append its readable text to a file, and queue new same-site links."""
    # Requires (module level): import mechanize, urlparse, getarticle -- Python 2.
    br = mechanize.Browser()
    br.set_handle_robots(False)  # ignore robots.txt
    br.addheaders = [('User-agent', 'Firefox')]
    print root
    try:
        html1 = br.open(root).read()
        try:
            art_text = getarticle.getReadableArticle(root)
            print art_text
            if len(art_text) > 0:
                myfile = open("categorieslist.txt", "a")
                try:
                    art_text = art_text.decode('UTF-8')
                    # Keep only substantial articles (over 500 characters).
                    if len(art_text) > 500:
                        myfile.write(art_text + "\n")
                except UnicodeError:
                    pass  # skip text that cannot be decoded or written
                myfile.close()
        except Exception:
            pass  # article extraction failed; still harvest the links below
        for link in br.links():
            newurl = urlparse.urljoin(link.base_url, link.url)
            # Queue only unvisited links that stay on the start domain.
            if self.root in newurl.replace("www.", "") and newurl not in self.visited:
                self.urls.append(newurl)
                self.visited.append(newurl)
                print newurl
    except Exception:
        pass  # page failed to open; skip it
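The snippet above references self.root, self.urls, and self.visited without defining them. Below is a minimal sketch of the crawler class it appears to belong to; only the attribute names come from the original code, while the class name Scraper, the constructor arguments, and the crawl loop are assumptions for illustration.

import mechanize
import urlparse
import getarticle

class Scraper:
    def __init__(self, domain, start_url):
        # Assumed constructor: 'domain' (e.g. "example.com") drives the
        # same-site check in scrapeStep; 'start_url' seeds the frontier.
        self.root = domain
        self.urls = [start_url]     # frontier of pages still to visit
        self.visited = [start_url]  # everything already queued

    # scrapeStep(self, root), as defined above, would be a method here.

    def crawl(self, max_pages=50):
        # Breadth-first walk: scrape the oldest queued URL until the
        # frontier is empty or the page budget runs out.
        count = 0
        while self.urls and count < max_pages:
            self.scrapeStep(self.urls.pop(0))
            count += 1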
Example #2
def scrapeStep(self, root):
    br = mechanize.Browser()
    br.set_handle_robots(False)  # ignore robots.txt
    br.addheaders = [('User-agent', 'Firefox')]
    try:
        print "opening " + root  # the original printed an undefined 'url' here
        html1 = br.open(root).read()
        try:
            art_text = getarticle.getReadableArticle(root)
        except Exception:
            print "PHAIL"  # article extraction failed
        for link in br.links():
            newurl = urlparse.urljoin(link.base_url, link.url)
            # Queue only unvisited links that stay on the start domain.
            if self.root in newurl.replace("www.", "") and newurl not in self.visited:
                self.urls.append(newurl)
                self.visited.append(newurl)
                print newurl
    except Exception:
        pass  # page failed to open; skip it
Example #3
def getTrans(url):
    """Round-trip an article through Google Translate: English -> Chinese -> English."""
    article_attrs = getarticle.getReadableArticle(url)
    readable = ""
    readable_article = article_attrs[0]
    # Keep only the characters that encode cleanly to UTF-8.
    for u in readable_article:
        try:
            readable += u.encode("UTF-8")
        except UnicodeError:
            pass
    readable_title = article_attrs[1]

    chinese = test1.translateString("en", "zh-CN", str(readable))
    english = test1.translateString("zh-CN", "en", chinese)

    # print readable_article[100:200]
    # print chinese[100:200]
    # print english[100:200]
    # print english
    return english
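A hedged usage sketch of getTrans: it takes an article URL and returns the text after the English -> Chinese -> English round trip, which paraphrases the wording. The URL below is a placeholder, not from the original code.

spun = getTrans("http://example.com/some-article")  # placeholder URL
print spun[:200]  # first 200 characters of the round-tripped text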
def scrapeStep(self, root):
    br = mechanize.Browser()
    br.set_handle_robots(False)  # ignore robots.txt
    br.addheaders = [('User-agent', 'Firefox')]
    try:
        print "opening " + root  # the original printed an undefined 'url' here
        html1 = br.open(root).read()
        try:
            art_text = getarticle.getReadableArticle(root)
        except Exception:
            print "PHAIL"  # article extraction failed
        for link in br.links():
            newurl = urlparse.urljoin(link.base_url, link.url)
            # Queue only unvisited links that stay on the start domain.
            if self.root in newurl.replace("www.", "") and newurl not in self.visited:
                self.urls.append(newurl)
                self.visited.append(newurl)
                print newurl
    except Exception:
        pass  # page failed to open; skip it
Example #5
def getTrans(url):
    """Round-trip an article through Google Translate: English -> Chinese -> English."""
    article_attrs = getarticle.getReadableArticle(url)
    readable = ""
    readable_article = article_attrs[0]
    # Keep only the characters that encode cleanly to UTF-8.
    for u in readable_article:
        try:
            readable += u.encode('UTF-8')
        except UnicodeError:
            pass
    readable_title = article_attrs[1]

    chinese = test1.translateString("en", "zh-CN", str(readable))
    english = test1.translateString("zh-CN", "en", chinese)

    # print readable_article[100:200]
    # print chinese[100:200]
    # print english[100:200]
    # print english
    return english
import gethtml
import articletext
import google_translate
# import articletrans
import getarticle
from bs4 import BeautifulSoup

url = "http://www.nytimes.com/2013/11/14/us/politics/democrats-threaten-to-abandon-obama-on-health-law-provision.html?hp&_r=0"

article_attrs = getarticle.getReadableArticle(url)
readable_article = article_attrs[0]
readable_title = article_attrs[1]

# Round-trip the article through Arabic and back to English.
arabic = google_translate.translateString("en", "ar", str(readable_article))
english = google_translate.translateString("ar", "en", arabic)

# Google Translate Article Generator Python
# https://www.youtube.com/watch?v=jweHHc5nAwE
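One way to gauge how much the double translation rewrote the article is a similarity ratio from the standard library; this is an illustrative addition, not part of the original script.

import difflib

# 1.0 means identical text; lower values mean heavier paraphrasing.
similarity = difflib.SequenceMatcher(None, str(readable_article), english).ratio()
print "round-trip similarity: %.2f" % similarity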