Example #1
    def __init__(self, conf=None):

        self.conf = conf

        self.start_urls = [
            'https://www2.ed.gov/about/offices/list/oela/index.html'
        ]

        # Make rules
        self.rules = [
            Rule(
                LinkExtractor(
                    allow=self.allowed_regex,
                    deny=[
                        "\\" + regex
                        for regex in h.get_data_extensions().keys()
                    ],
                    #restrict_xpaths='//div[@id="maincontent"]'
                ),
                callback=parse,
                follow=True),
        ]

        # Initialize the parent class
        super().__init__()
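
These excerpts omit their module context. Below is a minimal sketch of the scaffolding they appear to rely on; the import paths, the helper module `h`, and the module-level `parse` callback are all assumptions inferred from the snippets, not confirmed by the source. Note that every example assigns `self.rules` before calling the parent initializer: `CrawlSpider.__init__()` compiles the rules, so they must exist by then.

from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

# Assumed helper module exposing get_data_extensions(),
# get_document_extensions(), get_avoidable_extensions(), etc.
import helpers as h  # hypothetical import path

# Assumed module-level callback. CrawlSpider reserves the method name
# `parse` for its own crawling logic, which is likely why the examples
# reference a free-standing function rather than a spider method.
from .parser import parse  # hypothetical import path


class Crawler(CrawlSpider):
    name = 'edgov'                 # hypothetical spider name
    allowed_domains = ['ed.gov']   # hypothetical
    allowed_regex = r'ed\.gov'     # hypothetical

    def process_value(self, value):
        # Hypothetical URL normalizer referenced by later examples
        return value
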
Example #2
    def __init__(self):

        self.start_urls = [
            'https://nces.ed.gov/datatools/',
            'https://ies.ed.gov/data.asp',
            'https://nces.ed.gov/pubsearch/index.asp?PubSectionID=1&HasSearched=1&pubspagenum=1&sort=3&order=0&L1=&L2=&searchstring=&searchtype=AND&searchcat2=&searchcat=title&pagesize=15&searchmonth=3&searchyear=2018&datetype=ge&pubtype=010&surveyname=&surveyid=&centername=NCES&center=NCES',
            'https://nces.ed.gov/Datalab/TablesLibrary',
            #'https://nces.ed.gov/pubs2009/expenditures/tables/table_08.asp?referrer=report'
            #'https://nces.ed.gov/surveys/els2002/tables/APexams_01.asp'
            #'https://nces.ed.gov/ipeds/deltacostproject/'
            #'https://nces.ed.gov/pubs2009/expenditures/tables.asp'
        ]

        # Make rules
        self.rules = [
            Rule(
                LinkExtractor(
                    allow=self.allowed_regex,
                    deny=[
                        "\\" + regex
                        for regex in h.get_data_extensions().keys()
                    ],
                    # restrict_xpaths='//*[@id="maincontent"]'
                    # process_value=lambda value: value.replace('http', 'https', 1),
                ),
                callback=parse,
                follow=True),
        ]

        # Initialize the parent class
        super().__init__()
Example #3
    def __init__(self):

        self.start_urls = [
            'https://dashboard.ed.gov/',
        ]

        extensions_to_avoid = []
        for ext in [
                h.get_data_extensions(),
                h.get_document_extensions(),
                h.get_avoidable_extensions()
        ]:
            extensions_to_avoid.extend(ext.keys())

        # Make rules
        self.rules = [
            Rule(LinkExtractor(
                allow=self.allowed_regex,
                deny_extensions=[ext[1:] for ext in extensions_to_avoid],
                process_value=self.process_value,
                unique=True,
                deny_domains=h.retrieve_crawlers_allowed_domains(
                    except_crawlers=['edgov'])),
                 callback=parse,
                 follow=True),
        ]

        # Initialize the parent class
        super().__init__()
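
A note on the two deny styles used above. The sketch below assumes `h.get_data_extensions()` returns a dict keyed by dotted extensions (e.g. '.csv'), which is inferred from the snippets rather than confirmed:

# Hypothetical sample of what the helper presumably returns as keys
extensions = ['.csv', '.xls']

# LinkExtractor(deny=...) takes regexes, so the examples escape the
# leading dot: '\\' + '.csv' yields the pattern r'\.csv'
deny_patterns = ['\\' + ext for ext in extensions]

# LinkExtractor(deny_extensions=...) takes bare extensions without the
# dot, hence the ext[1:] slice: '.csv' becomes 'csv'
bare_extensions = [ext[1:] for ext in extensions]
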
Example #4
    def __init__(self):

        self.start_urls = [
            'http://rems.ed.gov/',
            'https://rems.ed.gov/REMSPublications.aspx',
            'https://rems.ed.gov/#resources',
        ]

        extensions_to_avoid = []
        for ext in [
                h.get_data_extensions(),
                h.get_document_extensions(),
                h.get_avoidable_extensions()
        ]:
            extensions_to_avoid.extend(ext.keys())

        # Make rules
        self.rules = [
            Rule(
                LinkExtractor(
                    #allow_domains=self.allowed_domains,
                    #allow=self.allowed_regex,
                    #deny_extensions=[ext[1:] for ext in extensions_to_avoid],
                    process_value=self.process_value,
                    unique=True,
                ),
                callback=parse,
                follow=True,
                process_links='process_links',
                process_request='process_request'),
        ]

        # Initialize the parent class
        super().__init__()
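
Example #4 passes `process_links` and `process_request` as strings; Scrapy resolves such strings to spider methods when it compiles the rules. A minimal sketch of what those methods might look like on the spider class follows; the bodies are hypothetical, and `process_request` receives the originating response as a second argument only in Scrapy 2.0 and later:

class Crawler(CrawlSpider):

    def process_links(self, links):
        # Hypothetical: drop fragment-only links such as '/#resources'
        return [link for link in links if '#' not in link.url]

    def process_request(self, request, response):
        # Hypothetical: pass the request through unchanged
        return request
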
Example #5
    def __init__(self):

        self.start_urls = [
            'https://www2.ed.gov/finaid/prof/resources/data/teach-institution.html',
            'https://www2.ed.gov/',
            'https://www2.ed.gov/about/offices/list/index.html'
        ]

        extensions_to_avoid = []
        for ext in [
                h.get_data_extensions(),
                h.get_document_extensions(),
                h.get_avoidable_extensions()
        ]:
            extensions_to_avoid.extend(ext.keys())

        # Make rules
        self.rules = [
            Rule(LinkExtractor(
                allow=self.allowed_regex,
                deny_extensions=[ext[1:] for ext in extensions_to_avoid],
                deny_domains=h.retrieve_crawlers_allowed_domains(
                    except_crawlers=['edgov'])),
                 callback=parse,
                 follow=True),
        ]

        # Initialize the parent class
        super().__init__()
Example #6
    def __init__(self):

        self.start_urls = [
            # Get all records since 1980
            'https://ies.ed.gov/pubsearch/index.asp?searchyear=1980',
            # Imported from the nces parser
            'https://nces.ed.gov/datatools/',
            'https://ies.ed.gov/data.asp',
            'https://nces.ed.gov/pubsearch/index.asp',
            'https://nces.ed.gov/Datalab/TablesLibrary',
            #'https://nces.ed.gov/pubs2009/expenditures/tables/table_08.asp?referrer=report'
            #'https://nces.ed.gov/surveys/els2002/tables/APexams_01.asp'
            #'https://nces.ed.gov/ipeds/deltacostproject/'
            #'https://nces.ed.gov/pubs2009/expenditures/tables.asp'
        ]

        # Make rules
        self.rules = [
            Rule(LinkExtractor(
                allow=self.allowed_regex,
                deny=["\\" + regex for regex in h.get_data_extensions().keys()],
                # restrict_xpaths='//*[@id="maincontent"]'
                # process_value=lambda value: value.replace('http', 'https', 1),
            ), callback=parse, follow=True),
        ]

        # Initialize the parent class
        super().__init__()
Example #7
    def __init__(self, conf=None):

        self.conf = conf

        self.start_urls = [
            'https://www2.ed.gov/about/offices/list/osers/index.html',
            'https://www2.ed.gov/about/offices/list/osers/products/employmentguide/index.html'
        ]

        # Make rules
        self.rules = [
            Rule(
                LinkExtractor(allow=self.allowed_regex,
                              deny_extensions=[
                                  regex[1:]
                                  for regex in h.get_data_extensions().keys()
                              ],
                              allow_domains=Crawler.allowed_domains
                              #restrict_xpaths='//*[@id="maincontent"]'
                              ),
                callback=parse,
                follow=True),
        ]

        # Initialize the parent class
        super().__init__()
Example #8
def resource_checker(tag_attr: str) -> bool:
    """Filter function for BeautifulSoup: returns True if ``tag_attr``
    points to a resource file (i.e. one with a DATA_EXTENSIONS suffix)."""

    if not tag_attr:  # covers both '' and None
        return False
    for extension in h.get_data_extensions().keys():
        # Match on the extension, but skip file names that appear on the
        # deny list (``deny_list`` is presumably defined at module level)
        if tag_attr.lower().endswith(extension) and \
                tag_attr[tag_attr.rfind('/') + 1:].lower() not in deny_list:
            return True
    # No data extension matched
    return False
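
For context, a filter like `resource_checker` is typically passed to BeautifulSoup as an attribute filter; BeautifulSoup then calls it with each tag's attribute value (or None when the attribute is absent, which is why the empty/None guard matters). A minimal usage sketch, assuming the helper module `h` and `deny_list` from the example are in scope:

import requests
from bs4 import BeautifulSoup

html = requests.get('https://rems.ed.gov/').text
soup = BeautifulSoup(html, 'html.parser')

# BeautifulSoup invokes resource_checker once per <a> tag, passing the
# value of its href attribute
for link in soup.find_all('a', href=resource_checker):
    print(link['href'])
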
Example #9
    def __init__(self):

        self.start_urls = [
            'https://www2.ed.gov/finaid/prof/resources/data/teach-institution.html',
            'https://www2.ed.gov/',
            'https://www2.ed.gov/about/offices/list/index.html'
        ]

        # Make rules
        self.rules = [
            Rule(LinkExtractor(
                #allow=self.allowed_regex,
                deny_extensions=[regex[1:] for regex in h.get_data_extensions().keys()],
                deny_domains=h.retrieve_crawlers_allowed_domains(except_crawlers=['edgov'])
                #restrict_xpaths='//*[@id="maincontent"]',
            ), callback=parse, follow=True),
        ]

        # Initialize the parent class
        super().__init__()
Example #10
    def __init__(self):

        self.start_urls = ['https://sites.ed.gov/']

        extensions_to_avoid = []
        for ext in [
                h.get_data_extensions(),
                h.get_document_extensions(),
                h.get_avoidable_extensions()
        ]:
            extensions_to_avoid.extend(ext.keys())

        # Make rules
        self.rules = [
            Rule(LinkExtractor(
                allow=self.allowed_regex,
                deny_extensions=[ext[1:] for ext in extensions_to_avoid],
            ),
                 callback=parse,
                 follow=True),
        ]

        # Initialize the parent class
        super().__init__()