def parse_project_campaigns(self):
    ksq_scraped = ksqlite("../Data/mlks_scraped.db")
    ksq_parsed = ksqlite("../Data/mlks_parsed.db")
    projects = ksq_scraped.select_all_project_ids_and_campaigns()
    count = 0
    for project in projects:
        soup = BeautifulSoup(project["campaign"], 'html.parser')
        campaign_json = self.load_json_from_campaign(soup)
        count += 1
        print(count)  # visual CLI progress
        # Flatten the nested campaign JSON and write each field into the parsed DB.
        for key, value in self.flatten_dict(campaign_json).items():
            ksq_parsed.update_field("Projects", key, str(value),
                                    "project_id", project["project_id"])
def parse_creator_biography(self):
    ksq_scraped = ksqlite("../Data/mlks_scraped.db")
    ksq_parsed = ksqlite("../Data/mlks_parsed.db")
    creators = ksq_scraped.select_all_creator_about()
    count = 0
    for creator in creators:
        soup = BeautifulSoup(creator["about"], 'html.parser')
        for paragraph in soup.find_all("p", {"class": "mb3"}):
            # Drop tabs/newlines, then squeeze runs of whitespace to single spaces.
            biography = ' '.join(
                re.sub(r'[\t\r\n]', '', paragraph.text.strip()).split())
            ksq_parsed.update_field("Creators", "biography", biography,
                                    "creator_id", creator["creator_id"])
        count += 1
        print(count)  # visual CLI progress
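
# ksqlite is the project's small sqlite3 wrapper and does not appear in this
# listing. A hypothetical sketch of the parts used here, assuming update_field
# issues a parameterized UPDATE (class shape and method body are assumptions):
import sqlite3

class ksqlite:
    def __init__(self, db_path=None):
        self.connection = sqlite3.connect(db_path) if db_path else None

    def update_field(self, table, field, value, key_column, key_value):
        # Values are bound as parameters; identifiers are interpolated because
        # sqlite3 cannot parameterize table or column names.
        query = "UPDATE {} SET {} = ? WHERE {} = ?".format(table, field, key_column)
        self.connection.execute(query, (value, key_value))
        self.connection.commit()
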
def parse_creator_backed(self):
    ksq_scraped = ksqlite("../Data/mlks_scraped.db")
    ksq_parsed = ksqlite("../Data/mlks_parsed.db")
    creators = ksq_scraped.select_all_creator_about()
    count = 0
    for creator in creators:
        soup = BeautifulSoup(creator["about"], 'html.parser')
        # The backed count sits in <a class="js-backed-link"><span class="count">.
        # The original outer loop over every <a> re-scanned the page once per
        # link; a single pass over the js-backed-link anchors is enough.
        for link in soup.find_all("a", {"class": "js-backed-link"}):
            for span in link.find_all("span", {"class": "count"}):
                backed = span.string.strip()
                ksq_parsed.update_field("Creators", "backed", backed,
                                        "creator_id",
                                        creator["creator_id"])
        count += 1
        print(count)  # visual CLI progress
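
# The two nested find_all calls above can also be collapsed with a CSS
# selector; an equivalent form under the same markup assumptions would be:
#     for span in soup.select("a.js-backed-link span.count"):
#         backed = span.string.strip()
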
def parse_and_insert_creator_id(self):
    ksq = ksqlite()
    db_connection = ksq.db_connection("../Data/mlks.db")
    projects = ksq.select_creator_is_null(db_connection)
    for project in projects:
        soup = BeautifulSoup(project["campaign"], 'html.parser')
        # The embedded campaign JSON carries the creator record; keep only its id.
        creator_id = self.load_json_from_campaign(soup)["creator"]["id"]
        ksq.update_creator_id(db_connection, creator_id, project["id"])
    db_connection.close()
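
# load_json_from_campaign is used by several methods but not shown. A plausible
# sketch, assuming the scraped page embeds the project record as JSON in a
# data-initial attribute (the attribute name is an assumption about the markup):
import json

def load_json_from_campaign(self, soup):
    node = soup.find(attrs={"data-initial": True})
    # Return an empty dict when the page carries no embedded project JSON.
    return json.loads(node["data-initial"]) if node else {}
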
def get_campaign(self):
    ksql = ksqlite()
    db_connection = ksql.db_connection(
        ksql.abs_file_path("../Data/projects_courtney.db"))
    projects = ksql.select_campaign_is_null(db_connection)
    print(projects)
    for project in projects:
        url = self.campaign_url(project["id"], project["keywords"])
        print(url)  # visual CLI progress
        campaign = urllib.request.urlopen(url).read()
        ksql.update_campaign(db_connection, campaign, project["id"])
        time.sleep(5)  # throttle requests to stay polite to the server
    db_connection.close()
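
# campaign_url is not included in this listing. A hypothetical sketch, assuming
# the numeric id and keyword slug map onto Kickstarter's
# /projects/<id>/<slug>/description URL scheme (an assumption, not confirmed):
def campaign_url(self, project_id, keywords):
    return ("https://www.kickstarter.com/projects/"
            + str(project_id) + "/" + keywords + "/description")
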
def parse_full_descriptions(self):
    ksq_scraped = ksqlite("../Data/mlks_scraped.db")
    ksq_parsed = ksqlite("../Data/mlks_parsed.db")
    projects = ksq_scraped.select_all_project_ids_and_campaigns()
    count = 0
    no_text = 0
    for project in projects:
        soup = BeautifulSoup(project["campaign"], 'html.parser')
        full_description = soup.find("div", {"class": "full-description"})
        if full_description is None:
            # No full-description block on this page; the original bare
            # except swallowed the resulting AttributeError.
            no_text += 1
            print(no_text)
            continue
        # Drop tabs/newlines, then squeeze runs of whitespace to single spaces.
        full_description = ' '.join(
            re.sub(r'[\t\r\n]', '', full_description.text).split())
        ksq_parsed.update_field("Projects", "full_description",
                                full_description, "project_id",
                                project["project_id"])
        count += 1
        print(count)
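
# The same whitespace-collapsing idiom appears in several of the parsers above;
# a shared helper (hypothetical, not part of the original listing) would read:
import re

def clean_text(raw):
    # Drop tabs/newlines, then squeeze runs of whitespace to single spaces.
    return ' '.join(re.sub(r'[\t\r\n]', '', raw).split())
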
def parse_creator_social(self):
    ksq_scraped = ksqlite("../Data/mlks_scraped.db")
    ksq_parsed = ksqlite("../Data/mlks_parsed.db")
    creators = ksq_scraped.select_all_creator_about()
    count = 0
    for creator in creators:
        soup = BeautifulSoup(creator["about"], 'html.parser')
        for link in soup.find_all('a'):
            link = link.get('href')
            if not link:
                continue  # anchors without an href would crash re.search
            if re.search(r'//www\.facebook\.com/', link):
                ksq_parsed.update_field("Creators", "facebook", link,
                                        "creator_id",
                                        creator["creator_id"])
            if re.search(r'//www\.twitter\.com/', link):
                ksq_parsed.update_field("Creators", "twitter", link,
                                        "creator_id",
                                        creator["creator_id"])
            if re.search(r'//www\.youtube\.com/', link):
                ksq_parsed.update_field("Creators", "youtube", link,
                                        "creator_id",
                                        creator["creator_id"])
        count += 1
        print(count)  # visual CLI progress
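
# The three near-identical branches above could be table-driven; an equivalent
# sketch using the same patterns and columns (helper name is hypothetical):
import re

SOCIAL_PATTERNS = {
    "facebook": r'//www\.facebook\.com/',
    "twitter": r'//www\.twitter\.com/',
    "youtube": r'//www\.youtube\.com/',
}

def parse_social_link(self, ksq_parsed, creator_id, href):
    # Write every matching network's link to its column in the parsed DB.
    for column, pattern in SOCIAL_PATTERNS.items():
        if re.search(pattern, href):
            ksq_parsed.update_field("Creators", column, href,
                                    "creator_id", creator_id)
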
def list_all_fields(self):
    ksq_scraped = ksqlite("../Data/mlks_scraped.db")
    campaigns = ksq_scraped.select_all_campaigns()
    fields = set()
    count = 0
    for campaign in campaigns:
        soup = BeautifulSoup(campaign["campaign"], 'html.parser')
        json_dict = self.load_json_from_campaign(soup)
        # A set deduplicates as we go; the original appended to a list and
        # rebuilt list(set(fields)) once per key.
        fields.update(self.flatten_dict(json_dict).keys())
        count += 1
        print(count)
        print(len(fields))

    print(sorted(fields))
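
# The select_* helpers on ksqlite are also not shown; a hypothetical sketch of
# one of them, continuing the wrapper sketched earlier and assuming rows come
# back as dicts via sqlite3.Row (column and table names are assumptions):
def select_all_campaigns(self):
    self.connection.row_factory = sqlite3.Row
    cursor = self.connection.execute("SELECT campaign FROM Projects")
    # Convert each row to a plain dict so callers can index by column name.
    return [dict(row) for row in cursor.fetchall()]
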
def get_creator(self):

    def url_creator_about(creator_id):
        return "https://www.kickstarter.com/profile/" + creator_id + "/about"

    ksql_scraped = ksqlite("../Data/mlks_scraped.db")
    creator_ids = ksql_scraped.select_creator_ids_where_about_is_null()
    count = len(creator_ids)
    for creator_id in creator_ids:
        url = url_creator_about(creator_id)
        print(url)  # visual CLI progress
        count -= 1
        print(count)  # profiles remaining to fetch
        about = urllib.request.urlopen(url).read()
        ksql_scraped.update_field("Creators", "about", about,
                                  "creator_id", creator_id)
        time.sleep(1.5)  # throttle requests to stay polite to the server
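
# A minimal retry wrapper around the fetch, for transient network errors
# (helper name and retry policy are assumptions, not part of the original code):
import time
import urllib.error
import urllib.request

def fetch_with_retry(url, attempts=3, delay=5):
    for attempt in range(attempts):
        try:
            return urllib.request.urlopen(url).read()
        except urllib.error.URLError:
            if attempt == attempts - 1:
                raise  # give up after the last attempt
            time.sleep(delay)  # back off before retrying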