Example #1
 def process_wayback_url(self, response):
     """
     Process the Wayback URL and mark the conference as Wayback Accessible if successful
     """
     conf_id = response.meta['conf_id']
     DatabaseHelper.mark_accessibility(conf_id, "Wayback Accessible",
                                       DB_FILEPATH)
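DatabaseHelper.mark_accessibility is not shown in these examples. A minimal sketch of what it presumably does, assuming the WikicfpConferences table (seen in Example #3) has id and accessibility columns (the column names are assumptions):

import sqlite3


class DatabaseHelper:

    @staticmethod
    def mark_accessibility(conf_id: int, accessibility: str, db_filepath):
        # Sketch: record the accessibility status for one conference row.
        # The 'id' and 'accessibility' column names are assumptions.
        conn = sqlite3.connect(str(db_filepath))
        conn.execute(
            "UPDATE WikicfpConferences SET accessibility = ? WHERE id = ?",
            (accessibility, conf_id))
        conn.commit()
        conn.close()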
Example #2
 def parse(self, response):
     """
     Parses conference homepage and determines whether found URLs are valid for further crawling
     """
     conf_id = response.meta['conf_id']
     content_type = get_content_type(response)
     DatabaseHelper.mark_crawled(conf_id, DB_FILEPATH)
     self.add_conf_page(conf_id, response)
     if content_type != 'pdf':
         # Crawl relevant links
         for url in get_relevant_urls(response, self.driver):
             if get_url_status(url) != 200:
                 DatabaseHelper.add_page(
                     ConferencePage(conf_id=conf_id,
                                    url=url,
                                    html="",
                                    content_type="Inaccessible"),
                     DB_FILEPATH)
             elif not DatabaseHelper.page_saved(url, DB_FILEPATH):
                 yield Request(url=url,
                               dont_filter=True,
                               meta={'conf_id': conf_id},
                               callback=self.parse_aux_conf_page,
                               errback=self.parse_page_error)
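get_url_status is an imported helper that is not shown here. A plausible sketch using requests, assuming it simply returns the HTTP status code for a URL (the fallback from HEAD to GET is an assumption):

import requests


def get_url_status(url: str, timeout: int = 10) -> int:
    """
    Sketch: return the HTTP status code for a URL.
    Tries HEAD first and falls back to GET, since some servers reject HEAD.
    """
    response = requests.head(url, allow_redirects=True, timeout=timeout)
    if response.status_code >= 400:
        response = requests.get(url, allow_redirects=True, timeout=timeout)
    return response.status_code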
Example #3
 def start_requests(self):
     """
     Get all Conference Homepage URLs from the database and yield scrapy Requests
     - Use the wayback url if the original url does not indicate the conference year
     """
     DatabaseHelper.create_db(DB_FILEPATH)
     conn = sqlite3.connect(str(DB_FILEPATH))
     cur = conn.cursor()
     confs = cur.execute(
         "SELECT * FROM WikicfpConferences WHERE crawled='No'").fetchall()
     cur.close()
     conn.close()
     for conf in confs:
         conf_id, url, wayback_url, accessibility = (
             conf[0], conf[3], conf[6], conf[8])
         # Two consecutive digits usually indicate the conference year in the url
         if accessibility == "Accessible URL" and re.search(r'\d{2}', url):
             access_url = url
         elif wayback_url != "Not Available":
             access_url = wayback_url
         else:
             continue
         yield Request(url=access_url,
                       dont_filter=True,
                       meta={'conf_id': conf_id},
                       callback=self.parse,
                       errback=self.parse_page_error,
                       headers=REQUEST_HEADERS)
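REQUEST_HEADERS comes from the project's configuration and is not shown; per the comment in Example #4 it carries a browser-like user agent so that sites that block crawlers still respond. A sketch of what it might contain (the exact values are assumptions):

# Sketch: browser-like headers; the exact values used by the project are assumptions.
REQUEST_HEADERS = {
    'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                   'AppleWebKit/537.36 (KHTML, like Gecko) '
                   'Chrome/80.0 Safari/537.36'),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
}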
Example #4
 def process_conference_url(self, conf_url: str, conf_id: int, wayback_url: str):
     """
     Check if the conference url is accessible, else check its availability on the Wayback Machine archive
     """
     # Metadata in case of request error
     meta = {'conf_id': conf_id}
     # Set an arbitrary browser agent in the headers since certain sites block crawlers
     try:
         if get_url_status(conf_url) == 200:
             DatabaseHelper.mark_accessibility(conf_id, "Accessible URL",
                                               DB_FILEPATH)
     except Exception as e:
         DatabaseHelper.mark_accessibility(conf_id, e.__class__.__name__,
                                           DB_FILEPATH)
         return scrapy.Request(url=wayback_url,
                               dont_filter=True,
                               meta=meta,
                               callback=self.process_wayback_url)
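How wayback_url is populated is not part of these examples. One plausible way, sketched here purely as an assumption, is the Wayback Machine availability API; the "Not Available" sentinel matches the check in Example #3:

import requests


def get_wayback_url(url: str) -> str:
    # Sketch (assumption): query the Wayback Machine availability API for the
    # most recent snapshot of a conference homepage.
    resp = requests.get('https://archive.org/wayback/available',
                        params={'url': url}, timeout=10)
    closest = resp.json().get('archived_snapshots', {}).get('closest', {})
    return closest.get('url', 'Not Available')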
Example #5
 def add_conf_page(self, conf_id: int, response: 'Response'):
     """
     Adds Conference Page to database
     """
     content_type = get_content_type(response)
     if content_type == 'pdf':
         # PDF pages are stored without rendered HTML
         page_html = ""
     else:
         self.driver.get(response.url)
         time.sleep(3)  # Give javascript time to load
         page_html = self.driver.page_source
     # Add Conference Homepage to database
     page_id = DatabaseHelper.add_page(
         ConferencePage(conf_id=conf_id,
                        url=response.url,
                        html=page_html,
                        content_type=content_type,
                        processed="No"), DB_FILEPATH)
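get_content_type is another imported helper that is not shown. A minimal sketch, assuming it classifies the response by its Content-Type header (any label besides 'pdf' is an assumption):

def get_content_type(response) -> str:
    # Sketch: derive a coarse content type from the HTTP Content-Type header.
    # Only 'pdf' is used for branching in these examples; the 'html' label is an assumption.
    header = response.headers.get('Content-Type', b'').decode('utf-8', 'ignore').lower()
    return 'pdf' if 'pdf' in header else 'html'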
Example #6
 def process_wikiconf(self, response):
     """
     Process an individual conference page within wikicfp
         - Parse the conference page and save basic conference info to the database

     Returns a request for the conference page to facilitate further crawling
     """
     parsed_conference: WikiConferenceItem = WikiConfParser.parse_conf(
         response)
     conf_id = DatabaseHelper.add_wikicfp_conf(parsed_conference,
                                               DB_FILEPATH)
     url = parsed_conference['url']
     if url:  # Check accessibility of both the direct URL and the WaybackMachine URL
         return self.process_conference_url(
             url, conf_id, parsed_conference['wayback_url'])
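WikiConferenceItem is accessed like a scrapy Item with at least url and wayback_url fields. A minimal sketch covering only what these examples touch; any further fields the real item defines are assumptions:

import scrapy


class WikiConferenceItem(scrapy.Item):
    # Minimal sketch: only the fields referenced in these examples are listed.
    # The real item presumably defines more (title, dates, etc. would be assumptions).
    url = scrapy.Field()
    wayback_url = scrapy.Field()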
Example #7
import argparse

from scrapy.crawler import CrawlerProcess

from cfp_crawl.cfp_spider.spiders.base_wikicfp_spider import BaseCfpSpider
from cfp_crawl.cfp_spider.spiders.wikicfp_all_spider import WikicfpAllSpider
from cfp_crawl.cfp_spider.spiders.wikicfp_latest_spider import WikicfpLatestSpider
from cfp_crawl.cfp_spider.spiders.conf_crawl import ConferenceCrawlSpider
# DatabaseHelper and DB_FILEPATH are also used below; their exact module within
# cfp_crawl is not shown in this snippet, so import them from wherever the project defines them.

parser = argparse.ArgumentParser(description='')
parser.add_argument('crawler', type=str, help="Specifies crawler type")
args = parser.parse_args()
crawl_type = args.crawler

# Start crawl
process = CrawlerProcess(settings={})
spider_type = {
    'wikicfp_all': WikicfpAllSpider,
    'wikicfp_latest': WikicfpLatestSpider,
    'conf_crawl': ConferenceCrawlSpider,
}
if crawl_type not in spider_type:
    print("Unspecified crawl type")
    print("Usage:\n\t python crawl <crawler_type>\n\t\
        'wikicfp_all': WikicfpAllSpider\n\t\
        'wikicfp_latest': WikicfpLatestSpider\n\t\
        'conf_crawl': ConferenceCrawlSpider"
    )

else:
    if crawl_type == 'wikicfp_all' or crawl_type == 'wikicfp_latest':
        DatabaseHelper.create_db(DB_FILEPATH)  # Create necessary DB tables
    process.crawl(spider_type[crawl_type])
    process.start()
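Going by the usage message above, the script is invoked as "python crawl <crawler_type>", for example "python crawl wikicfp_latest". For the two wikicfp crawlers it first creates the database tables if needed, then hands control to Scrapy's CrawlerProcess, whose start() call blocks until the selected spider finishes.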