Example #1
    def processArticle(self, response):
        url = response.url
        score = calc_score(url)
        if score >= 3:
            try:
                print 'this url may be a news_url'
                item = ArticleItem()
                g = Goose()
                article = g.extract(url=url)
                title = article.title
                content = article.cleaned_text
                if len(content) == 0:
                    print 'news in chinese'
                    g = Goose({'stopwords_class': StopWordsChinese})
                    article = g.extract(url=url)
                    content = article.cleaned_text
                item['articleTitle'] = title
                item['articleUrl'] = url
                item['articleContent'] = content
                yield item
            except Exception:
                self.logger.info('item in article failed')

        else:
            print 'this url may not be a news_url, score is only ' + str(score)
            print 'you can check this url: ' + url
            return
Example #2
def get_url_extract_body_text(url, config=None):
    # url ="https://edition.cnn.com/2012/02/22/world/europe/uk-occupy-london/index.html?hpt=ieu_c2"
    if config:
        g = Goose(config)
    else:
        g = Goose()
    article = g.extract(url=url)
    return article.cleaned_text
Example #3
    def __init__(self, url, author):
        self.url = url
        self.author = author
        goose = Goose({'stopwords_class': StopWordsChinese})
        article = goose.extract(url=url)
        if article.title == '':
            goose = Goose()
            article = goose.extract(url=url)
        self.title = article.title
        self.summary = article.cleaned_text[:150]
        self.body = article.cleaned_text
Example #4
def get_url_extract_links(url, config=None):
    # url ="https://edition.cnn.com/2012/02/22/world/europe/uk-occupy-london/index.html?hpt=ieu_c2"
    if config:
        g = Goose(config)
    else:
        g = Goose()
    article = g.extract(url=url)
    # print('title is ', article.title)
    # print('cleaned_text is ', article.cleaned_text)
    # print('meta is ', article.meta_description)
    # print('link is', article.canonical_link)
    print('link is:')
    for link in article.links:
        print(link)
Example #5
def getproxyip_list(urllist):
    proxyurllist = []
    for url in urllist:
        time.sleep(10)
        g = Goose()
        article = g.extract(url=url)
        soup = BeautifulSoup(article.raw_html, "html.parser")
        proxy_list = soup.find_all("tr")
        for proxyip in proxy_list:
            iplist = []
            all_td = proxyip.find_all("td")
            if all_td:
                for td_line in all_td:
                    val = td_line.text.strip()
                    if val:
                        iplist.append(val)
                    else:
                        if td_line.div:
                            iplist.append(td_line.div["title"])
                        else:
                            iplist.append(u"中国")
                proxyurllist.append(iplist)
                print(iplist)

    return proxyurllist
Example #6
def getUrl(item):
    url_name = re.split('&&', item)
    url = url_name[1]
    name = url_name[0]
    print url
    print name
    html_name = name + '.html'
    print html_name
    g = Goose({'stopwords_class': StopWordsChinese})
    article = g.extract(url=url)
    # print article.raw_html
    currentDir = os.getcwd() + '/' + 'pages' + '/' + name
    if not os.path.exists(currentDir):
        os.makedirs(currentDir)
    f = open(currentDir + '/' + html_name, 'a')
    google_transfer = open("transfer.js").read()
    print google_transfer

    f.write(google_transfer + article.raw_html)
    f.close()
    print article.title

    f_md = open(currentDir + '/' + name + '.md', 'a')
    f_md.write(article.cleaned_text.encode('utf-8'))
    f_md.close()
Example #7
def handle_message(msg):
    if msg is None or not isinstance(msg, dict):
        print 'message is broken'
        return

    task = msg

    g = Goose()
    article = g.extract(url=task['url'])

    # print article.cleaned_text

    task['text'] = article.cleaned_text

    # # Scraping CNN news
    # text = None
    # if task['source']['id'] == 'cnn':
    #     print "Scraping CNN news"
    #     text = cnn_news_scraper.extractNews(task['url'])
    # else:
    #     print "News source [%s] is not supported." % task['source']['name']
    #
    # task['text'] = text

    dedupe_news_queue_client.sendMessage(task)
Example #8
def get_data(rss, num):

    #pathToCSV = '/Users/Michal/Downloads/dialogflow-java-client-master2/samples/clients/VirtualTradingAssistant/src/main/java/ai/api/examples/fileStore/file.csv'
    #pathToCSV = 'C:\\Users\\ojwoo\\Documents\\Warwick\\CS261\\Coursework\\dialogflow-java-client-master\\samples\\clients\\VirtualTradingAssistant\\src\\main\\java\\ai\\api\\examples\\fileStore\\file.csv'
    #pathToCSV = '/Users/Michal/Desktop/apache-tomcat-8.5.28/bin/misc/file.csv'
    pathToCSV = 'C:\\apache-tomcat-8.5.28\\bin\\misc\\news.csv'

    with open(pathToCSV, 'w') as csvfile:
        wr = csv.writer(csvfile, delimiter='@', quotechar='#')
        index = 0
        for e in rss['entries']:
            if (index == int(num)):
                break

            wr.writerow([(e['title']).encode('utf-8')])
            wr.writerow([(e['link']).encode('utf-8')])

            try:
                g = Goose()
                article = g.extract(url=e['link'])

                cleaned_text = article.cleaned_text

                sent = sentiment(cleaned_text)

                if sent[0] < 0:
                    sent = 50 - (sent[0] * -50)
                else:
                    sent = sent[0] * 50 + 50

                wr.writerow([str(round(sent, 2)) + '%'])
            except TypeError:
                wr.writerow(['Sentiment Unavailable'])

            index = index + 1
Example #9
def extract_title(html):
    """
	 Extract the body title of a web page
	"""
    g = Goose({'enable_image_fetching': False})
    article = g.extract(raw_html=html)
    return article.title
Example #10
    def __init__(self,corpus_dir,datastore_type='file',db_name='corpus.db'):
        '''
        Read links and associated categories for the specified articles
        from a text file, separated by a space

        Args:
            corpus_dir (str): The directory to save the generated corpus
            datastore_type (Optional[str]): Format to save generated corpus.
                                            Specify either 'file' or 'sqlite'.
            db_name (Optional[str]): Name of database if 'sqlite' is selected.
        '''

        self.g = Goose({'browser_user_agent': 'Mozilla','parser_class':'soup'})
        #self.g = Goose({'browser_user_agent': 'Mozilla'})
        self.corpus_dir = corpus_dir
        self.datastore_type = datastore_type
        self.db_name = db_name
        self.stats = defaultdict(int)

        self._create_corpus_dir(self.corpus_dir)

        self.db = None
        if self.datastore_type == 'sqlite':
            self.db = self.corpus_dir + '/' + self.db_name
            self._set_up_db(self.db)
Example #11
 def run(self):
     df = pd.read_csv(LINKS_CSV)
     g = Goose({'stopwords_class': StopWordsChinese})
     df['content'] = df['url'].apply(
         lambda x: g.extract(url=x).cleaned_text)
     with self.output().open('w') as f:
         df.to_json(f, orient='records')
Example #12
def crawlerWebLink(url):
    g = Goose()
    article = g.extract(url=url)

    print(article.title)
    print(article.meta_description)
    print(article.cleaned_text)
Example #13
    def parse_item(self, response):
        article = Goose().extract(raw_html=response.body)

        yield Article(title=article.title,
                      text=article.cleaned_text,
                      url=response.url,
                      field=self.name)
Example #14
def sms_ahoy_reply():
    """Respond to incoming messages with a friendly SMS."""
    # Default to an empty string so the substring checks below do not fail on None
    body = request.values.get('Body', '')
    # Start our response
    with open('news') as data_file:
        current_news = json.load(data_file)
    headlines = current_news['articles']
    message = ""
    if "news" in body or "News" in body:
        headlines = current_news['articles']
        i = 0
        while i < 19:
            message = message + str(i +
                                    1) + ". " + headlines[i]['title'] + "\n"
            i = i + 1
    elif "more" in body:
        i = int(body.split()[0])
        headlines = current_news['articles']
        url = headlines[i - 1]['url']
        g = Goose()
        article = g.extract(url=url)
        message = article.cleaned_text[:1000]
        message2 = article.cleaned_text[1000:][:1000]
        message = message + "..."
    else:
        i = int(body)
        message = ""
        message = headlines[i - 1]['description']
    resp = MessagingResponse()
    # Add a message
    resp.message(message)

    return str(resp)
Example #15
def grab(location, keywords, publication, publication_date, title):
    goose = Goose()
    try:
        raw_article = goose.extract(url=location)
        description = raw_article.meta_description.encode("utf8")
        article = raw_article.cleaned_text.encode("utf8")
        split_keywords = keywords.split(',')

        summary = pyteaser.SummarizeUrl(location)
        output = json.dumps({
            "title": title,
            "keywords": split_keywords,
            "publication": publication,
            "publication_date": publication_date,
            "description": description,
            "source": location,
            "article": article,
            "summary": summary
        })
        logging.warning('Successfully grabbed through Goose.')
        logging.warning('Location: %s, Publication: %s' %
                        (location, publication))
        return output
    except:
        logging.critical('Unable to get article through Goose.')
        logging.critical('Location: %s, Publication: %s' %
                         (location, publication))
        return None
Example #16
    def parse(self, response, pr=None):
        for i in response.xpath('//a/@href').extract():
            if 'https://' in i or 'http://' in i:
                continue
            c = 0
            if re.match(r'(.*)\/(.*\.html)', response.url):
                c = 1
            urls = response.url.split("/")

            urls = '/'.join(urls[0:len(urls) - c - i.count("../")])
            if urls[-1] == '/':
                target = urls + i.split("../")[-1]
            else:
                target = urls + '/' + i.split("../")[-1]
            # print "target:", target
            if target == pr:
                continue
            yield scrapy.Request(target, callback=lambda res: self.parse(res, response.url))

        article = Goose().extract(raw_html=response.body)

        yield {
            "url": response.url,
            "article": article
        }
Example #17
    def getContent(self):
        g = Goose({'browser_user_agent': 'Mozilla', 'parser_class': 'soup'})
        urls = self.getNodeLinks()
        for i, url in enumerate(urls):
            article = g.extract(url=url)
            self.writteFile(i, 'title', article.title)
            self.writteFile(i, 'article', article.cleaned_text)
Example #18
def get_text(article_url):
    goose = Goose()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor())
    response = opener.open(article_url)
    raw_html = response.read()
    article = goose.extract(raw_html=raw_html)
    return article.cleaned_text
Example #19
def HTMLParser(url):
    response = get(url)
    extractor = Goose()
    article = extractor.extract(raw_html=response.content)
    text = article.cleaned_text

    return str(text.encode("ascii", "ignore"))
Example #20
 def process_item(self, item, spider):
     if 'pdf_Link' in item:
         pdfName = item['report_name'] + u".pdf"
         PDFPath = os.path.join(PDF_PATH, item['source_name'])
         if not os.path.exists(PDFPath): os.makedirs(PDFPath)
         filepath = os.path.join(PDFPath, pdfName)
         try:
             content = self.downloadPDF(item['pdf_Link'], filepath)
             item["report_content"] = content
         except:
             self.jsonInfoStored(item, pdfName)
             log.msg(
                 "pdf download failure, information is serializing to json files",
                 level=log.INFO)
     elif 'content_Link' in item:
         from goose import Goose
         from goose.text import StopWordsChinese
         try:
             g = Goose({'stopwords_class': StopWordsChinese})
             article = g.extract(url=item['content_Link'])
             content = article.cleaned_text
             del item['content_Link']
             item["report_content"] = content
         except:
             log.msg("Content extracted failure from page:%s" %
                     item['report_link'],
                     level=log.INFO)
     return item
Example #21
def run(index):
    r = urllib2.urlopen(CNN_URL+index).read()
    soup = BeautifulSoup(r,"lxml")
    headlines = []
    links = []
    articles = []
    print ("Start reading news for: "+ index)
    for div in soup.find_all('table', 'wsod_newsTable')[0]:
        for col in div:
            link = col.find('a')['href']
            headline = col.find('a').contents[0]
            g = Goose()
            articles.append(g.extract(url=link).cleaned_text)


    print ("Finished Reading!")
    tokens = []

    for article in articles:
        tokens += word_tokenize(article)
    tokens = filter(lambda word: word not in string.punctuation, tokens)
    result = []
    for word in set(tokens):
        if tokens.count(word)>20 and tokens.count(word)<100:
            result.append((word, tokens.count(word)))

    return result
Example #22
class GetContentPipeline(object):
    goose = Goose({'stopwords_class': StopWordsChinese})

    def process_item(self, item, spider):
        if item:
            url = item.url
            new_content = NewsContent()
            new_content.news = item
            article = GetContentPipeline.goose.extract(url=url)

            if (not article) or not (article.top_node):
                item.delete()
                raise DropItem(u"无法获取内容 %s" % item)
            text = article.top_node.text_content()
            if not text:
                item.delete()
                raise DropItem(u"无法获取内容 %s" % item)

            content = etree.tostring(article.top_node)
            text = BeautifulSoup(content).getText()
            if len(text) < 100:
                item.delete()
                raise DropItem(u"获取内容太短 %s" % item)
            new_content.content = content
            try:
                img = article.top_image.src
                new_content.content_img = img
                movie = article.movies[0].src
                new_content.movie = movie
            except:
                pass
            new_content.save()
            return item
Example #23
def extract(url):
    '''
    Extract the main body text of a web page
    '''
    g = Goose({'stopwords_class': StopWordsChinese})
    article = g.extract(url=url)
    return article.cleaned_text
Example #24
def parsepage(data, link):
    try:
        goo = Goose({'stopwords_class': StopWordsChinese})
        article = goo.extract(raw_html=data)
        return article
    except:
        traceback.print_exc()
Example #25
def parse_input(text, extractor='newspaper'):
    if isinstance(text, str) or isinstance(text, unicode):
        if text.startswith(('http://', 'https://')):
            # Input is a link - need to extract the text from html
            if extractor.lower() == 'goose':
                from goose import Goose
                urlparse = Goose()
                article = urlparse.extract(url=text)
                return unicode_to_ascii(article.cleaned_text)
            else:
                from newspaper import Article
                article = Article(text)
                article.download()
                article.parse()
                return unicode_to_ascii(article.text)
        elif text.endswith('.txt'):
            # Input is a file - need to read it
            textfile = open(text, 'rb')
            article = textfile.read()
            textfile.close()
            return unicode_to_ascii(article)
        else:
            # Input is a string containing the raw text
            return unicode_to_ascii(text)
    else:
        raise ValueError('Input text must be of type str or unicode.')
Example #26
def extract_body(html):
    """
	 Extract the body text of a web page
	"""
    g = Goose({'enable_image_fetching': False})
    article = g.extract(raw_html=html)
    return article.cleaned_text
Example #27
def print_news(url, content='title'):
    #parser = HtmlParser.from_url(url, Tokenizer(LANGUAGE))
    g = Goose()

    article = g.extract(url=url)

    # If a meta description is available, print it; otherwise fall back to
    # summarizing the article text
    if content == 'full' and article.meta_description:
        print(article.meta_description)
        return

    news_text = article.cleaned_text

    parser = PlaintextParser.from_string(news_text, Tokenizer(LANGUAGE))
    stemmer = Stemmer(LANGUAGE)

    summarizer = Summarizer(stemmer)
    summarizer.stop_words = get_stop_words(LANGUAGE)

    if content == 'title' or content == 'full':
        #Print article title
        print('\t* ' + str(article.title.encode('ascii', 'ignore')))

    if content == 'full':
        #Print a n-line summary
        for sentence in summarizer(parser.document, SENTENCES_COUNT):
            print(sentence)
    return
Example #28
def SplitArticle(url):
    g = Goose({'stopwords_class': StopWordsChinese})
    article = g.extract(url=url)

    total_words = len(article.cleaned_text)
    current_word = 0
    last_sentence = ''
    sentences_pool = []
    while current_word < total_words:
        sub_article = last_sentence + article.cleaned_text[
            current_word:min(current_word + 100, total_words)]
        complete = (sub_article[-1] == u'。')
        sentences = sub_article.split(u'。')
        for s in range(len(sentences) - 1):
            for sub_s in sentences[s].split('\n'):
                if not sub_s == '':
                    sentences_pool.append(sub_s.encode('utf-8') + '\n')

        if not complete:
            last_sentence = sentences[-1]
        else:
            last_sentence = ''
            for sub_s in sentences[-1].split('\n'):
                if not sub_s == '':
                    sentences_pool.append(sub_s.encode('utf-8') + '\n')

        current_word = min(current_word + 100, total_words)

    return sentences_pool
Example #29
def get_parser(url, tokenizer):
    useragent = ' '.join([
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6)",
        "AppleWebKit/537.36 (KHTML, like Gecko)",
        "Chrome/52.0.2743.116 Safari/537.36"
    ])

    # Scrape Web Page With HTMLParser and Goose and select the best scrape
    html_parser = HtmlParser.from_url(url, tokenizer)
    article = Goose({'browser_user_agent': useragent})

    # Goose raises IndexError when requesting unfamiliar sites.
    try:
        extract = article.extract(url=url)
    except Exception:
        extract = article.extract(raw_html=requests.get(url).text)

    goose_parser = PlaintextParser(extract, tokenizer)

    # Aggregate Site Metadata
    meta = {
        k: v
        for (k, v) in extract.infos.items()
        if k not in ('cleaned_text', 'links', 'tweets', 'movies')
    }
    # Select Best Parser
    parser = (
        html_parser
        if len(goose_parser.document.words) < len(html_parser.document.words)
        else  # noqa
        goose_parser)

    return parser, meta
Example #30
def goose_extractor(url):
    '''webpage extraction using
       Goose Library'''

    article = Goose().extract(url=url)
    return article.title, article.meta_description, article.cleaned_text