Example #1
import scrapy
from scrapy_splash import SplashRequest

# SynDevAdmin is the project's own MongoDB access helper; import it from
# wherever it lives in your project layout.


class RSCPaperSpider(scrapy.Spider):
    name = "RSC_Paper"

    # HTTP Basic Auth credentials picked up by Scrapy's HttpAuthMiddleware (redacted)
    http_user = '******'
    http_pass = '******'

    # Connect to the 'RSC' collection that tracks the article records
    db = SynDevAdmin.db_access()
    db.connect()
    col = db.collection('RSC')
    def start_requests(self):
        # Issue a Splash-rendered request for every article whose HTML has
        # not been crawled yet; the DOI travels along in request.meta.
        for doc in self.col.find({'HTML_Crawled': False}):
            request = SplashRequest(doc['Article_HTML_Link'],
                                    self.parse,
                                    args={'wait': 2})
            request.meta['DOI'] = doc['DOI']
            yield request

    def parse(self, response):
        try:
            # The rendered article page keeps its content inside <div id="wrapper">
            html = response.css('div#wrapper').extract_first()
            if html:
                # Store the extracted HTML and mark the record as crawled
                self.col.update({"DOI": response.meta['DOI']}, {
                    '$set': {
                        'HTML_Crawled': True,
                        "Paper_Content_HTML": html
                    }
                })
            else:
                # The page rendered, but the expected wrapper element was missing
                self.col.update({"DOI": response.meta['DOI']}, {
                    '$set': {
                        'HTML_Crawled': False,
                        'Error_Msg': "HTML string is None"
                    }
                })
        except Exception as e:
            # Record any other failure against the document's DOI
            self.col.update(
                {"DOI": response.meta['DOI']},
                {'$set': {
                    'HTML_Crawled': False,
                    'Error_Msg': str(e)
                }})
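
To run this spider, scrapy-splash must be enabled in the Scrapy settings. Below is a minimal sketch of running it standalone with CrawlerProcess, assuming a local Splash instance on the default port; the middleware and dupefilter entries are the standard ones from the scrapy-splash README, and the SPLASH_URL value is an assumption to adjust for your own setup.

from scrapy.crawler import CrawlerProcess

if __name__ == '__main__':
    process = CrawlerProcess(settings={
        # Address of the running Splash instance (assumed local default)
        'SPLASH_URL': 'http://localhost:8050',
        # Standard scrapy-splash middleware stack
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy_splash.SplashCookiesMiddleware': 723,
            'scrapy_splash.SplashMiddleware': 725,
            'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
        },
        'SPIDER_MIDDLEWARES': {
            'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
        },
        'DUPEFILTER_CLASS': 'scrapy_splash.SplashAwareDupeFilter',
    })
    process.crawl(RSCPaperSpider)
    process.start()
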
Example #2
import argparse

import requests
from bs4 import BeautifulSoup

# SynDevAdmin is the project's own MongoDB access helper; import it from
# wherever it lives in your project layout.

# urljoin moved between Python 2 and Python 3
try:
    from urlparse import urljoin           # Python 2
except ImportError:
    from urllib.parse import urljoin       # Python 3

__author__ = 'Ziqin (Shaun) Rong'
__maintainer__ = 'Ziqin (Shaun) Rong'
__email__ = '*****@*****.**'

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("-j",
                        type=str,
                        help="ECS journal name, one of EEL, JES, JSS, SSL")
    args = parser.parse_args()

    # Each journal has its own issue collection, e.g. 'JES_issue'
    db = SynDevAdmin.db_access()
    db.connect()
    issue_col = db.collection('{}_issue'.format(args.j))

    # Visit every issue page that has not been scraped yet
    for doc in issue_col.find({"Scraped": False}):
        res = requests.get(doc["URL"])
        if res.status_code == 200:
            soup = BeautifulSoup(res.content, 'lxml')
            abstract_links = []
            # Each '.toc-cit' block is one table-of-contents entry; collect the
            # absolute URL of its abstract link, if present.
            for article in soup.select('.toc-cit'):
                abstract_link = article.find_all('a', {'rel': 'abstract'})
                if abstract_link:
                    abstract_links.append(
                        urljoin(doc["URL"], abstract_link[0].get('href')))
            issue_col.update(
                {"_id": doc["_id"]},