# Example #1
# 0
class Crawler:
    """Crawl news articles for a keyword and export word weights to CSV.

    Builds a table mapping (pos-key, url) -> accumulated weight from
    article titles and bodies, follows outgoing links up to ``depth``
    levels, and writes one CSV row per table entry.

    Collaborators (defined elsewhere in the project): CustomJSONFormatter
    (article feed), POS (tagger returning a {key: weight} dict), Scraper
    (page fetcher) and CSV (output writer).
    """

    def __init__(self, depth, sampleSize, keyword):
        self.depth = depth            # maximum crawl recursion depth
        self.contents = []            # article bodies from the feed
        self.crawlContents = []       # bodies discovered while crawling
        self.titles = []              # article titles from the feed
        self.linksCrawled = []        # urls already visited
        self.datecreated = {}         # url -> ISO "publishedAt" timestamp
        self.keyword = keyword
        self.sampleSize = sampleSize  # soft cap on collected items/links
        self.voidedTitles = ["BBC Homepage", "Sign in"]
        self.results = {}

        self.knownlinks = [".mp4"]
        today = str(date.today())
        self.csvFile = CSV(keyword + "," + today + ".csv",
                           ["Word", "Tag", "Weight", "Link", "DateCreated"])

    def _accumulate(self, posDict, weights, url):
        """Merge one POS weight dict into posDict, keyed by (pos-key, url)."""
        for key in weights:
            entry = (key, url)
            if entry in posDict:
                posDict[entry] = posDict[entry] + weights[key]
            else:
                posDict[entry] = weights[key]

    def start(self):
        """Fetch the feed, build the weight table, crawl, and write the CSV."""
        urlsToCrawl = []
        # Renamed from `json`, which shadowed the stdlib module name.
        formatter = CustomJSONFormatter(self.keyword)
        articles = formatter.getValues("articles")

        for item in articles:
            urlsToCrawl.append(item["url"])
            self.contents.append(item["description"])
            self.titles.append(item["title"])
            self.datecreated[item["url"]] = item["publishedAt"]

        print("")
        print("Links crawled:")

        posDict = {}  # (pos-key, url) -> accumulated weight

        # Weights from article titles; titles containing a backslash are
        # skipped, matching the original behaviour.
        for title, url in zip(self.titles, urlsToCrawl):
            if "\\" not in title:
                self._accumulate(posDict, POS.POS(title), url)

        # Weights from article bodies.  BUG FIX: the original read
        # self.datecreated[url] before assigning url for this iteration.
        for content, url in zip(self.contents, urlsToCrawl):
            if content is None:
                continue
            for node in content:
                if not isinstance(node, str):
                    # Parsed HTML node: tag only its first text fragment
                    # (the original broke out of the fragment loop).
                    for fragment in node.find_all(text=True):
                        self._accumulate(posDict, POS.POS(fragment, True), url)
                        break
                else:
                    # Plain string node: tag the whole body once and stop.
                    self._accumulate(posDict, POS.POS(content), url)
                    break

        # BUG FIX: copy instead of aliasing self.contents; crawl() appends
        # to crawlContents and the alias double-counted against sampleSize.
        self.crawlContents = list(self.contents)

        # BUG FIX: snapshot the keys — crawl() inserts new entries, and a
        # live view would raise "dictionary changed size during iteration".
        for entry in list(posDict.keys()):
            self.crawl(entry[1], 0, self.keyword, posDict)

        for entry in list(posDict.keys()):
            posKey, link = entry
            print(entry)
            print(posDict[entry])
            print(self.datecreated.get(link))

            # BUG FIX: links discovered by crawl() have no feed timestamp;
            # the original raised KeyError.  Emit an empty date instead.
            published = self.datecreated.get(link)
            if published:
                # "YYYY-MM-DDThh:mm:ss" -> "DD/MM/YYYY"
                ymd = published.split("T")[0].split("-")
                dateCreated = ymd[2] + "/" + ymd[1] + "/" + ymd[0]
            else:
                dateCreated = ""

            self.csvFile.push({
                "Word": self.lowerCase(posKey[0]),
                "Tag": posKey[1],
                "Weight": posDict[entry],
                "Link": link,
                "DateCreated": dateCreated,
            })
        self.csvFile.save()

    def getContents(self):
        """Return the list of feed article bodies."""
        return self.contents

    def lowerCase(self, sentence):
        """Return `sentence` lower-cased."""
        return sentence.lower()

    def crawl(self, link, depth, keyword, posDict):
        """Scrape `link`, fold its text into posDict, and spawn threads to
        follow its outgoing links one level deeper.

        NOTE(review): posDict and the bookkeeping lists are shared across
        worker threads without locking — confirm whether races matter here.
        """
        print("Depth of " + str(depth) + " : " + link)

        if link in self.linksCrawled:
            return
        # Links agreeing on their first 100 characters count as duplicates.
        for seen in self.linksCrawled:
            if link[:100] == seen[:100]:
                return
        if depth >= self.depth:
            return

        self.linksCrawled.append(link)
        scraper = Scraper(link, keyword)
        hyperLinkList = scraper.scrapeLinks(link)
        articleDictionary = scraper.scrape('p')

        print(articleDictionary)

        for item in articleDictionary:
            if item not in self.crawlContents:
                self.crawlContents.append(item)
                if isinstance(item, str):
                    self._accumulate(posDict, POS.POS(item), link)
                    # BUG FIX: the original printed `item` once per POS key.
                    print(item)

        # Recurse only while under the sample-size budget.  BUG FIX: the
        # original appended `link` to linksCrawled a second time here,
        # inflating the count used in the budget check.
        if (len(self.crawlContents) + len(self.contents) <= self.sampleSize
                and len(articleDictionary) != 0
                and len(self.linksCrawled) <= self.sampleSize):
            if hyperLinkList is not None:
                # BUG FIX: the original loop variable shadowed `link`.
                for sublink in hyperLinkList:
                    if sublink not in self.linksCrawled and "ad" not in sublink:
                        _thread.start_new_thread(
                            self.crawl,
                            (sublink, depth + 1, self.keyword, posDict))

    def getAbsoluteLink(self, link):
        """Return 'protocol//host' for a url like 'https://host/path'."""
        pathList = link.split("/")
        return pathList[0] + "//" + pathList[2]