def parse_project_campaigns(self):
    """Parse the embedded campaign JSON of every scraped project and
    persist each flattened key/value into the parsed database.

    Reads campaigns from mlks_scraped.db and writes one Projects-table
    field per flattened JSON key into mlks_parsed.db.
    """
    ksq_scraped = ksqlite("../Data/mlks_scraped.db")
    ksq_parsed = ksqlite("../Data/mlks_parsed.db")
    count = 0
    for project in ksq_scraped.select_all_project_ids_and_campaigns():
        soup = BeautifulSoup(project["campaign"], 'html.parser')
        campaign_json = self.load_json_from_campaign(soup)
        count += 1
        print(count)  # crude command-line progress indicator
        for field_name, field_value in self.flatten_dict(campaign_json).items():
            ksq_parsed.update_field("Projects", field_name, str(field_value),
                                    "project_id", project["project_id"])
def parse_creator_biograpy(self):
    """Extract each creator's biography paragraphs from the scraped
    about-page HTML and store them in the parsed database.

    NOTE(review): "biograpy" is a typo, but it is the public method name
    and the DB column string — presumably matching the schema — so both
    are kept as-is; confirm before renaming.
    """
    ksq_scraped = ksqlite("../Data/mlks_scraped.db")
    ksq_parsed = ksqlite("../Data/mlks_parsed.db")
    count = 0
    for creator in ksq_scraped.select_all_creator_about():
        soup = BeautifulSoup(creator["about"], 'html.parser')
        for paragraph in soup.findAll("p", {"class": "mb3"}):
            # Strip tabs/newlines, then collapse runs of whitespace to
            # single spaces.
            text = ' '.join(
                re.sub(r'[\t\r\n]', '', paragraph.text.strip()).split())
            ksq_parsed.update_field("Creators", "biograpy", text,
                                    "creator_id", creator["creator_id"])
        count += 1
        print(count)  # crude command-line progress indicator
def parse_creator_backed(self):
    """Parse the "projects backed" count from each creator's about page
    and store it in the parsed database.

    BUG FIX: the original wrapped the search in an extra
    ``for link in soup.find_all('a')`` loop whose variable was
    immediately shadowed by the inner ``findAll`` over the whole soup,
    so the identical search — and the identical DB update — was
    re-executed once per anchor tag on the page. The redundant outer
    loop is removed; the stored value is unchanged.
    """
    ksq_scraped = ksqlite("../Data/mlks_scraped.db")
    ksq_parsed = ksqlite("../Data/mlks_parsed.db")
    creators = ksq_scraped.select_all_creator_about()
    count = 0
    for creator in creators:
        soup = BeautifulSoup(creator["about"], 'html.parser')
        for link in soup.findAll("a", {"class": "js-backed-link"}):
            for span in link.findAll("span", {"class": "count"}):
                backed = span.string.strip()
                ksq_parsed.update_field("Creators", "backed", backed,
                                        "creator_id", creator["creator_id"])
        count += 1
        print(count)  # crude command-line progress indicator
def parse_and_insert_creator_id(self):
    """Backfill the creator id for every project row missing one.

    Parses the creator id out of each stored campaign's embedded JSON
    and writes it back to the same row.

    FIX: the connection close now runs in a ``finally`` block — the
    original leaked the sqlite connection if any campaign failed to
    parse mid-loop.
    """
    ksq = ksqlite()
    db_connection = ksq.db_connection("../Data/mlks.db")
    try:
        for project in ksq.select_creator_is_null(db_connection):
            soup = BeautifulSoup(project["campaign"], 'html.parser')
            creator_id = self.load_json_from_campaign(soup)["creator"]["id"]
            ksq.update_creator_id(db_connection, creator_id, project["id"])
    finally:
        # Close even when parsing a campaign raises.
        db_connection.close()
def get_campaign(self):
    """Download the campaign HTML for every project whose campaign is
    still NULL, storing each page and throttling between requests.

    FIXES:
    - The HTTP response object was never closed; ``urlopen`` is now used
      as a context manager.
    - The DB connection leaked if any download raised; closing now
      happens in a ``finally`` block.
    """
    ksql = ksqlite()
    db_connection = ksql.db_connection(
        ksql.abs_file_path("../Data/projects_courtney.db"))
    try:
        projects = ksql.select_campaign_is_null(db_connection)
        print(projects)
        for project in projects:
            url = self.campaign_url(project["id"], project["keywords"])
            print(url)  # visual cl progress
            with urllib.request.urlopen(url) as response:
                campaign = response.read()
            ksql.update_campaign(db_connection, campaign, project["id"])
            time.sleep(5)  # polite crawl delay between requests
    finally:
        db_connection.close()
def parse_full_descriptions(self):
    """Extract the full-description text from every scraped campaign page
    and store it in the parsed database.

    FIX: the original used a bare ``except:`` around the whole body,
    which silently swallowed *every* error (including KeyboardInterrupt
    and DB failures). The only expected failure — the page having no
    ``full-description`` div, so ``soup.find`` returns ``None`` — is now
    handled explicitly; anything else propagates.
    """
    ksq_scraped = ksqlite("../Data/mlks_scraped.db")
    ksq_parsed = ksqlite("../Data/mlks_parsed.db")
    projects = ksq_scraped.select_all_project_ids_and_campaigns()
    count = 0
    no_text = 0
    for project in projects:
        soup = BeautifulSoup(project["campaign"], 'html.parser')
        full_description = soup.find("div", {"class": "full-description"})
        if full_description is None:
            # Campaign page has no full-description section.
            no_text += 1
            print(no_text)
            continue
        # Strip tabs/newlines, then collapse whitespace runs to spaces.
        text = ' '.join(
            re.sub(r'[\t\r\n]', '', full_description.text).split())
        ksq_parsed.update_field("Projects", "full_description", text,
                                "project_id", project["project_id"])
        count += 1
        print(count)  # crude command-line progress indicator
def parse_creator_social(self):
    """Scan each creator's about page for Facebook/Twitter/YouTube links
    and store each matching URL in the parsed database.

    FIX: ``anchor.get('href')`` returns ``None`` for anchors without an
    href attribute, which crashed ``re.search`` with a TypeError; such
    anchors are now skipped. Regex literals are raw strings (identical
    patterns, just the conventional spelling).
    """
    ksq_scraped = ksqlite("../Data/mlks_scraped.db")
    ksq_parsed = ksqlite("../Data/mlks_parsed.db")
    creators = ksq_scraped.select_all_creator_about()
    count = 0
    for creator in creators:
        soup = BeautifulSoup(creator["about"], 'html.parser')
        for anchor in soup.find_all('a'):
            link = anchor.get('href')
            if link is None:
                continue  # anchor without href — nothing to match
            if re.search(r'//www\.facebook\.com/', link):
                ksq_parsed.update_field("Creators", "facebook", link,
                                        "creator_id", creator["creator_id"])
            if re.search(r'//www\.twitter\.com/', link):
                ksq_parsed.update_field("Creators", "twitter", link,
                                        "creator_id", creator["creator_id"])
            if re.search(r'//www\.youtube\.com/', link):
                ksq_parsed.update_field("Creators", "youtube", link,
                                        "creator_id", creator["creator_id"])
        count += 1
        print(count)  # crude command-line progress indicator
def list_all_fields(self):
    """Print the count and sorted names of all distinct flattened
    campaign-JSON field names across every scraped campaign.

    FIX: the original appended every key to a list and rebuilt
    ``list(set(fields))`` on *every* iteration — accidental quadratic
    churn. A set accumulates the keys with the same final output.
    """
    ksq_scraped = ksqlite("../Data/mlks_scraped.db")
    campaigns = ksq_scraped.select_all_campaigns()
    fields = set()
    count = 0
    for campaign in campaigns:
        soup = BeautifulSoup(campaign["campaign"], 'html.parser')
        json_dict = self.load_json_from_campaign(soup)
        fields.update(self.flatten_dict(json_dict).keys())
        count += 1
        print(count)  # crude command-line progress indicator
    print(len(fields))
    print(sorted(fields))
def get_creator(self):
    """Download the about page for every creator whose "about" field is
    still NULL, storing the raw HTML and throttling between requests.

    Prints each URL and a countdown of remaining fetches as progress.
    """
    ksql_scraped = ksqlite("../Data/mlks_scraped.db")
    creator_ids = ksql_scraped.select_creator_ids_where_about_is_null()
    remaining = len(creator_ids)
    for creator_id in creator_ids:
        # Profile about-page URL for this creator.
        url = "https://www.kickstarter.com/profile/" + creator_id + "/about"
        print(url)
        remaining = remaining - 1
        print(remaining)
        about = urllib.request.urlopen(url).read()
        ksql_scraped.update_field("Creators", "about", about,
                                  "creator_id", creator_id)
        time.sleep(1.5)  # polite crawl delay between requests