# Example #1
# 0
class Crawler:
    """Crawl news articles for a keyword and export word weights to CSV.

    Builds a table mapping (pos-key, url) -> accumulated weight from
    article titles and bodies, follows outgoing links up to ``depth``
    levels, and writes one CSV row per table entry.

    Collaborators (defined elsewhere in the project): CustomJSONFormatter
    (article feed), POS (tagger returning a {key: weight} dict), Scraper
    (page fetcher) and CSV (output writer).
    """

    def __init__(self, depth, sampleSize, keyword):
        self.depth = depth            # maximum crawl recursion depth
        self.contents = []            # article bodies from the feed
        self.crawlContents = []       # bodies discovered while crawling
        self.titles = []              # article titles from the feed
        self.linksCrawled = []        # urls already visited
        self.datecreated = {}         # url -> ISO "publishedAt" timestamp
        self.keyword = keyword
        self.sampleSize = sampleSize  # soft cap on collected items/links
        self.voidedTitles = ["BBC Homepage", "Sign in"]
        self.results = {}

        self.knownlinks = [".mp4"]
        today = str(date.today())
        self.csvFile = CSV(keyword + "," + today + ".csv",
                           ["Word", "Tag", "Weight", "Link", "DateCreated"])

    def _accumulate(self, posDict, weights, url):
        """Merge one POS weight dict into posDict, keyed by (pos-key, url)."""
        for key in weights:
            entry = (key, url)
            if entry in posDict:
                posDict[entry] = posDict[entry] + weights[key]
            else:
                posDict[entry] = weights[key]

    def start(self):
        """Fetch the feed, build the weight table, crawl, and write the CSV."""
        urlsToCrawl = []
        # Renamed from `json`, which shadowed the stdlib module name.
        formatter = CustomJSONFormatter(self.keyword)
        articles = formatter.getValues("articles")

        for item in articles:
            urlsToCrawl.append(item["url"])
            self.contents.append(item["description"])
            self.titles.append(item["title"])
            self.datecreated[item["url"]] = item["publishedAt"]

        print("")
        print("Links crawled:")

        posDict = {}  # (pos-key, url) -> accumulated weight

        # Weights from article titles; titles containing a backslash are
        # skipped, matching the original behaviour.
        for title, url in zip(self.titles, urlsToCrawl):
            if "\\" not in title:
                self._accumulate(posDict, POS.POS(title), url)

        # Weights from article bodies.  BUG FIX: the original read
        # self.datecreated[url] before assigning url for this iteration.
        for content, url in zip(self.contents, urlsToCrawl):
            if content is None:
                continue
            for node in content:
                if not isinstance(node, str):
                    # Parsed HTML node: tag only its first text fragment
                    # (the original broke out of the fragment loop).
                    for fragment in node.find_all(text=True):
                        self._accumulate(posDict, POS.POS(fragment, True), url)
                        break
                else:
                    # Plain string node: tag the whole body once and stop.
                    self._accumulate(posDict, POS.POS(content), url)
                    break

        # BUG FIX: copy instead of aliasing self.contents; crawl() appends
        # to crawlContents and the alias double-counted against sampleSize.
        self.crawlContents = list(self.contents)

        # BUG FIX: snapshot the keys — crawl() inserts new entries, and a
        # live view would raise "dictionary changed size during iteration".
        for entry in list(posDict.keys()):
            self.crawl(entry[1], 0, self.keyword, posDict)

        for entry in list(posDict.keys()):
            posKey, link = entry
            print(entry)
            print(posDict[entry])
            print(self.datecreated.get(link))

            # BUG FIX: links discovered by crawl() have no feed timestamp;
            # the original raised KeyError.  Emit an empty date instead.
            published = self.datecreated.get(link)
            if published:
                # "YYYY-MM-DDThh:mm:ss" -> "DD/MM/YYYY"
                ymd = published.split("T")[0].split("-")
                dateCreated = ymd[2] + "/" + ymd[1] + "/" + ymd[0]
            else:
                dateCreated = ""

            self.csvFile.push({
                "Word": self.lowerCase(posKey[0]),
                "Tag": posKey[1],
                "Weight": posDict[entry],
                "Link": link,
                "DateCreated": dateCreated,
            })
        self.csvFile.save()

    def getContents(self):
        """Return the list of feed article bodies."""
        return self.contents

    def lowerCase(self, sentence):
        """Return `sentence` lower-cased."""
        return sentence.lower()

    def crawl(self, link, depth, keyword, posDict):
        """Scrape `link`, fold its text into posDict, and spawn threads to
        follow its outgoing links one level deeper.

        NOTE(review): posDict and the bookkeeping lists are shared across
        worker threads without locking — confirm whether races matter here.
        """
        print("Depth of " + str(depth) + " : " + link)

        if link in self.linksCrawled:
            return
        # Links agreeing on their first 100 characters count as duplicates.
        for seen in self.linksCrawled:
            if link[:100] == seen[:100]:
                return
        if depth >= self.depth:
            return

        self.linksCrawled.append(link)
        scraper = Scraper(link, keyword)
        hyperLinkList = scraper.scrapeLinks(link)
        articleDictionary = scraper.scrape('p')

        print(articleDictionary)

        for item in articleDictionary:
            if item not in self.crawlContents:
                self.crawlContents.append(item)
                if isinstance(item, str):
                    self._accumulate(posDict, POS.POS(item), link)
                    # BUG FIX: the original printed `item` once per POS key.
                    print(item)

        # Recurse only while under the sample-size budget.  BUG FIX: the
        # original appended `link` to linksCrawled a second time here,
        # inflating the count used in the budget check.
        if (len(self.crawlContents) + len(self.contents) <= self.sampleSize
                and len(articleDictionary) != 0
                and len(self.linksCrawled) <= self.sampleSize):
            if hyperLinkList is not None:
                # BUG FIX: the original loop variable shadowed `link`.
                for sublink in hyperLinkList:
                    if sublink not in self.linksCrawled and "ad" not in sublink:
                        _thread.start_new_thread(
                            self.crawl,
                            (sublink, depth + 1, self.keyword, posDict))

    def getAbsoluteLink(self, link):
        """Return 'protocol//host' for a url like 'https://host/path'."""
        pathList = link.split("/")
        return pathList[0] + "//" + pathList[2]