Code example #1
def get_total_domains():
    sqlite = SQLite()
    total_links = sqlite.select_collected_domains()
    total = []
    for link in total_links:
        total.append(link[0])
    return total
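
Assuming, as the loop above implies, that select_collected_domains() returns row tuples with the domain value in column 0, the same flattening can be written as a list comprehension (a sketch, not part of the project):

def get_total_domains():
    # keep only the first column of each returned row
    sqlite = SQLite()
    return [row[0] for row in sqlite.select_collected_domains()]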
Code example #2
def get_sqlite_rsschannels():
    sqlite = SQLite()
    sqlite_rss_channels = []
    data = sqlite.select_rss_channels()
    for info in data:
        sqlite_rss_channels.append(
            RssChannel(url=info[0], domain=info[1], last_update=info[2]))
    return sqlite_rss_channels
Code example #3
 def test_insert_rsschannels(self):
     rsschannel = RssChannel(
         url="https://akashjaindxb.com/about/feed/",
         cloud="{\"test\":12}", skipHours="[]", skipDays="[]",
         imagewidth=0, imageheight=0,
         pubDate=datetime.datetime.strptime(
             "Wed, 29 Apr 2020 22:49:17 +0000", "%a, %d %b %Y %H:%M:%S %z"))
     mysql = MySQL()
     mysql.insert_rss_channels([rsschannel])
     mysql.commit()
     sqlite = SQLite()
     sqlitersschannels = []
     for rssch in [rsschannel]:
         sqlitersschannels.append(RssChannel(url=rssch.url))
     sqlite.delete_rss_channels(rsschannels=sqlitersschannels)
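
For reference, the constructor calls in Code examples #2, #3 and #12 suggest an RssChannel item shaped roughly like the sketch below. This is an inferred, hypothetical definition, not the project's actual items module:

class RssChannel:
    # Hypothetical field list inferred from the keyword arguments used above.
    def __init__(self, url="", domain="", last_update=None, pubDate=None,
                 cloud="", skipHours="[]", skipDays="[]",
                 imagewidth=0, imageheight=0):
        self.url = url                    # feed URL
        self.domain = domain              # domain the feed belongs to
        self.last_update = last_update
        self.pubDate = pubDate
        self.cloud = cloud
        self.skipHours = skipHours
        self.skipDays = skipDays
        self.imagewidth = imagewidth
        self.imageheight = imageheight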
Code example #4
 def test_update_collected_domains_state(self):
     dmns = [
         Domain("https://nytimes.com"),
         Domain("https://twitter.com"),
         Domain("https://noidea.com")
     ]
     sqlite = SQLite("../../Storage/localdb.db")
     sqlite.update_collected_domains(dmns)
     sqlite.close()
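
The Domain item used here and in Code examples #13 and #14 appears to carry just the domain URL plus crawl bookkeeping. A hypothetical sketch (the real class lives in the project's items module):

import datetime

class Domain:
    def __init__(self, domain="", last_update=None, state=0):
        self.domain = domain        # base URL of the collected site
        self.last_update = last_update or datetime.datetime.today()
        self.state = state          # crawl state flag (0 appears to mean "waiting", see Code example #8)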
Code example #5
    def test_parse_internalwebsite(self):
        with open("./test_storage/internal.html","r",encoding="UTF-8" ,errors="igniore") as html:
            db = SQLite("../../Storage/localdb.db")
            html = html.read()
            response = scrapy.http.HtmlResponse(url="https://www.nytimes.com/section/politics",status=200,body=html,encoding="UTF-8")
            s = WebSitesSpider(start_urls=["https://nytimes.com"],total_external_links=["https://nytimes.com"])
            s.localdb = db

            domain = "nytimes.com"
            website_domaine = getDomaine(response.url)
            rss_channels_links = s.getWebSiteRssChannelsLinks(response)
            links = s.getWebSiteLinks(response)
            extrernalLinks = s.getExtrernalLinks(website_domaine,links)
            #________________
            s.save_sqlite_external_links(extrernalLinks)
            s.save_sqlite_rss_channels(domain,rss_channels_links)
            print("INERNAL WEBISTE SCRAPE DATA : ",{"domaine":website_domaine,"url":response.url,"rsschannelslinks":rss_channels_links,"externalLinks":extrernalLinks,"containerwebsiteinfoDOAMIN":domain})
Code example #6
    def test_parsewebsite(self):
        with open("./test_storage/domain.html","r",encoding="UTF-8" ,errors="igniore") as html:
            db = SQLite("../../Storage/localdb.db")
            html = html.read()
            response = scrapy.http.HtmlResponse(url="https://nytimes.com",status=200,body=html,encoding="UTF-8")
            s = WebSitesSpider(start_urls=["https://nytimes.com"],total_external_links=["https://nytimes.com"])
            s.localdb = db

            website_domaine = getDomaine(response.url)
            domain = s.get_domain(response)
            website_rss_links = s.getWebSiteRssChannelsLinks(response)
            links = s.getWebSiteLinks(response)
            externalLinks = s.getExtrernalLinks(domaine=website_domaine,total_links=links)
            internalLinks = s.getWebSiteInternalLinks(domaine=website_domaine,total_links=links)
            #_________________
            #s.save_mysql_domains([domain])
            s.update_collected_domains([getBaseUrl(response.url)])
            #s.save_sqlite_external_links(externalLinks)
            #s.save_sqlite_rss_channels(website_domaine,website_rss_links)

            #print("DOMAIN SCRAPED DATA : ",{"infos":domain._object(),"rsslinks":website_rss_links,"internalLinks":internalLinks,"externalLinks":externalLinks})
            #_________________
            print("info", {"infos":domain._object(),"rsslinks":website_rss_links,"internalLinks":internalLinks,"externalLinks":externalLinks})
Code example #7
def get_total_domains():
    sqlite = SQLite()
    return sqlite.select_collected_domains()
Code example #8
def get_waiting_domains():
    sqlite = SQLite()
    return sqlite.select_collected_domains(state=0)
Code example #9
 def test_sqlite_init(self):
     sqlite = SQLite("../../Storage/localdb.db")
     self.assertNotEqual(sqlite.connexion, None,
                         "FAILED CRFEATING SQLITE CONNEXION")
     sqlite.close()
     self.assertEqual(sqlite.connexion, None, "FAILED CLOSING DB")
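
The two assertions above only hold if the wrapper exposes its connection through a connexion attribute and resets it on close(). A minimal sketch of that contract, assuming a plain sqlite3 connection underneath (this is an assumption, not the project's actual SQLite class):

import sqlite3

class SQLite:
    def __init__(self, path="../../Storage/localdb.db"):
        self.path = path
        self.connexion = sqlite3.connect(path)   # non-None right after init

    def open(self):
        # re-open after close(); the spider calls open()/close() around each write
        if self.connexion is None:
            self.connexion = sqlite3.connect(self.path)

    def close(self):
        if self.connexion is not None:
            self.connexion.close()
            self.connexion = None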
Code example #10
 def test_select_rss_channels(self):
     sqlite = SQLite("../../Storage/localdb.db")
     print(sqlite.select_rss_channels())
     sqlite.close()
Code example #11
 def test_select_domains(self):
     sqlite = SQLite("../../Storage/localdb.db")
     print(sqlite.select_collected_domains(state=2))
     sqlite.close()
Code example #12
 def test_insert_rss_channels_links(self):
     rch = RssChannel("hellodomain", "domain_test")
     sqlite = SQLite("../../Storage/localdb.db")
     sqlite.insert_rss_channels([rch])
     sqlite.close()
Code example #13
 def test_insert_domains(self):
     d = Domain("domain_test")
     sqlite = SQLite("../../Storage/localdb.db")
     sqlite.insert_collected_domains([d])
     sqlite.close()
Code example #14
import datetime
import traceback
from time import sleep
from typing import Iterable
from urllib import parse

import scrapy
from scrapy import Request

# Project-local dependencies (SQLite, MySQL, DataBase, items, Domaine,
# getDomaine and getBaseUrl) are imported from the project's own modules,
# which are not shown in this snippet.


class WebSitesSpider(scrapy.Spider):
    name = "websites_spider"
    start_urls = []
    allowed_domains = []
    handle_httpstatus_list = [404, 500]
    localdb = SQLite()
    total_rss_channels_links = []
    total_external_links = []

    def __init__(self, start_urls, total_external_links):
        self.start_urls = start_urls
        self.total_external_links = total_external_links
        self.setup_allowed_domains()

    def start_requests(self):
        try:
            print("START SCRAPING WEBSITES urls : ", self.start_urls)
            self.setup_allowed_domains()
            for url in self.start_urls:
                print("REQUESTING TO DOMAIN URL : ", url, " .........")
                yield scrapy.Request(url=url,
                                     callback=self.parse_website,
                                     dont_filter=True,
                                     encoding="UTF-8")
                sleep(5)
        except Exception:
            traceback.print_exc()

    def parse_website(self, response):
        website_domaine = getDomaine(response.url)
        websiteurl = getBaseUrl(response.url)
        if response.status in (404, 500):
            self.update_collected_domains([websiteurl], status=4)
            print("---- DOMAIN NOT FOUND -> ", websiteurl)
            return None
        try:
            print("EXTRACTING DOMAIN INFO ....")

            domain = self.get_domain(response)
            website_rss_links = self.getWebSiteRssChannelsLinks(response)
            links = self.getWebSiteLinks(response)
            externalLinks = self.getExtrernalLinks(domaine=website_domaine,
                                                   total_links=links)
            internalLinks = self.getWebSiteInternalLinks(
                domaine=website_domaine, total_links=links)
            #_________________
            print("-- SAVING EXTRACTED DOMAIN DATA ....")
            self.save_mysql_domains([domain])
            self.update_collected_domains([websiteurl], status=1)
            self.save_sqlite_external_links(externalLinks)
            self.save_sqlite_rss_channels(getBaseUrl(response.url),
                                          website_rss_links)
            #_________________
            if internalLinks == [] and externalLinks == [] and website_rss_links == []:
                self.update_collected_domains([websiteurl], status=3)
                return
            else:
                print("FETCHUING DOMAIN INTERNAL LINKS....")
                for internalLink in internalLinks:
                    if self.allowed_domains.count(
                            getDomaine(internalLink)) == 0:
                        self.allowed_domains.append(getDomaine(internalLink))
                    yield Request(internalLink,
                                  callback=self.parse_internalwebsite,
                                  dont_filter=True,
                                  meta={"domain": domain},
                                  encoding="UTF-8")
                    sleep(2)
            self.update_collected_domains([websiteurl], status=2)
        except Exception:
            traceback.print_exc()

    def parse_internalwebsite(self, response):
        print("-- FETCHING AND SAVING DATA FOR INTERNAL LINK  ",
              response.url + " ......")
        try:
            domain = response.meta.get("domain")
            website_domaine = getDomaine(response.url)
            rss_channels_links = self.getWebSiteRssChannelsLinks(response)
            links = self.getWebSiteLinks(response)
            extrernalLinks = self.getExtrernalLinks(website_domaine, links)
            #________________
            self.save_sqlite_external_links(extrernalLinks)
            self.save_sqlite_rss_channels(domain.url, rss_channels_links)
            #________________
        except Exception:
            traceback.print_exc()

    def save_mysql_domains(self, domains):
        mysql = MySQL()
        mysql.insert_domains(domains)
        mysql.close()

    def save_sqlite_external_links(self, external_links):
        if external_links == []: return
        collected_domains = []
        for external_link in external_links:
            collected_domains.append(items.Domain(external_link))
        self.localdb.open()
        self.localdb.insert_collected_domains(collected_domains)
        self.localdb.close()

    def save_sqlite_rss_channels(
        self,
        domain,
        rss_channels_links: Iterable = [],
    ):
        if rss_channels_links == []: return None
        rss_channels = []
        for rss_channel_link in rss_channels_links:
            rss_channels.append(items.RssChannel(rss_channel_link, domain))
        self.localdb.open()
        self.localdb.insert_rss_channels(rss_channels)
        self.localdb.close()

    def update_collected_domains(
        self,
        domainslinks,
        status: int = 1,
        last_update: datetime.datetime = None):
        # Evaluate the timestamp at call time; a datetime.datetime.today()
        # default argument would be frozen when the module is imported.
        if last_update is None:
            last_update = datetime.datetime.today()
        args = []
        for domain in domainslinks:
            args.append(
                DataBase.items.Domain(domain=domain,
                                      last_update=last_update,
                                      state=status))
        self.localdb.open()
        self.localdb.update_collected_domains(args)
        self.localdb.close()

#___________________________ HELPERS ___________________________________________

    def getWebSiteRssChannelsLinks(self, response):
        rssChannelsLinksNodes = response.css(
            "link[type='application/rss+xml']::attr('href')")
        filtratedRssChannelsLinks = []
        for rssChannelLinkNode in rssChannelsLinksNodes:
            rssChannelLink = rssChannelLinkNode.extract()
            if self.total_rss_channels_links.count(
                    rssChannelLink) == 0 and getDomaine(
                        response.url) == getDomaine(rssChannelLink):
                filtratedRssChannelsLinks.append(rssChannelLink)
                self.total_rss_channels_links.append(rssChannelLink)
        return filtratedRssChannelsLinks

    def getWebSiteLinks(self, response):
        linksNodes = response.css("body *[href^='https']::attr('href')")
        links = []
        for linkNode in linksNodes:
            link = linkNode.extract()
            if links.count(link) == 0: links.append(link)
        return links

    def getWebSiteInternalLinks(self,
                                domaine,
                                total_links: Iterable = None,
                                response=None):
        if total_links is None: total_links = self.getWebSiteLinks(response)
        internal_links = []
        for link in total_links:
            if internal_links.count(link) == 0 and getDomaine(
                    link) == domaine and domaine != link:
                internal_links.append(link)
        return internal_links[0:10]

    def getExtrernalLinks(self,
                          domaine,
                          total_links: Iterable = None,
                          response=None):
        if total_links is None: total_links = self.getWebSiteLinks(response)
        external_links = []
        for link in total_links:
            if external_links.count(
                    getBaseUrl(link)) == 0 and self.total_external_links.count(
                        getBaseUrl(link)) == 0 and getDomaine(link) != domaine:
                external_links.append(getBaseUrl(link))
                self.total_external_links.append(getBaseUrl(link))
        return external_links

    def get_domain(self, response):
        websiteName = self.select_css(
            response, "meta[name='application-name']::attr('content')")
        websiteType = self.select_css(
            response, "meta[property='og:type']::attr('content')")
        websiteImage = self.select_css(
            response, "meta[property='og:image']::attr('content')")
        websiteKeyWords = self.select_css(
            response, "meta[name='keywords']::attr('content')")
        websiteIconUrl = self.select_css(
            response, "link[rel='shortcut icon']::attr('href')")
        websiteLanguage = self.select_css(
            response, "meta[itemprop='inLanguage']::attr('content')")
        websiteRobots = self.select_css(
            response, "meta[name='robots']::attr('content')")
        websiteStartUrl = self.select_css(
            response, "meta[name='msapplication-starturl']::attr('content')")

        websiteDescription = self.select_css(
            response, "meta[name='description']::attr('content')")
        if websiteDescription == "":
            websiteDescription = self.select_css(
                response, "meta[property='og:description']::attr('content')")

        websiteTitle = self.select_css(
            response, "meta[property='og:title']::attr('content')")
        if websiteTitle == "":
            websiteTitle = self.select_css(response, "title::text")

        return Domaine(url=getBaseUrl(response.url),
                       name=websiteName,
                       description=websiteDescription,
                       roletype=websiteType,
                       title=websiteTitle,
                       image=websiteImage,
                       keywords=websiteKeyWords,
                       language=websiteLanguage,
                       robots=websiteRobots,
                       icon=websiteIconUrl,
                       starturl=websiteStartUrl)

    def select_css(self, selector, css):
        node = selector.css(css)
        if node:
            return node.extract_first()
        else:
            return ""

    def setup_allowed_domains(self, urls: Iterable = None):
        if urls is None: urls = self.start_urls
        self.allowed_domains.clear()
        for url in urls:
            self.allowed_domains.append(str(parse.urlparse(url).hostname))
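
One way to launch the spider from a script is Scrapy's standard CrawlerProcess API. This is only a sketch; the project may wire the crawl up differently, and the start URL below is just a placeholder taken from the tests above:

from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings={"LOG_LEVEL": "INFO"})
process.crawl(WebSitesSpider,
              start_urls=["https://nytimes.com"],
              total_external_links=[])
process.start()   # blocks until the crawl finishes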
Code example #15
 def test_openlocaldb(self):
     s = SQLite("../../Storage/localdb.db")
     self.assertNotEqual(s.connexion,None,"FAILED OPEN DB")