def write_article(self):
    '''Insert the article URLs carried by ``self.message`` into ``article``.

    ``self.message['data']`` must be a JSON document with an ``articles``
    list of URL strings and a ``newssource`` string.  A payload equal to
    ``1`` is skipped without touching the database (presumably the
    subscribe acknowledgement of a pub/sub channel -- TODO confirm).

    Rows are sent through ``mysql.write`` in batches of at most 100, each
    statement ending in ``ON DUPLICATE KEY UPDATE url=url``.  The row id is
    the SHA-1 hex digest of the stripped, quote-escaped URL.

    Returns:
        None.
    '''
    base_sql = "INSERT INTO article (id,url,newssource,crawled_at) VALUES "
    end_sql = " ON DUPLICATE KEY UPDATE url=url"
    batch_size = 100

    if self.message['data'] == 1:
        return

    def quote_escape(text):
        # Backslash-escape both quote characters and trim outer whitespace.
        return str(text.replace("\"", "\\\"").replace("'", "\\'").strip())

    def sql_literal(text):
        # Double the backslashes first: otherwise a value containing (or
        # ending in) "\" could swallow the closing quote of the literal.
        return quote_escape(text.replace("\\", "\\\\"))

    data = json.loads(self.message['data'])
    articles = data['articles']
    newssource = sql_literal(data['newssource'])

    rows = []
    for article in articles:
        # The id is derived from the quotes-only form so that it matches
        # the ids of rows that are already stored.
        key = hashlib.sha1(quote_escape(article)).hexdigest()
        crawled_at = str(int(time.time()))
        rows.append(
            "(\"" + key + "\",\"" + sql_literal(article) + "\",\""
            + newssource + "\"," + crawled_at + ")"
        )
        if len(rows) == batch_size:
            mysql.write(base_sql + ",".join(rows) + end_sql,
                        self.cursor, self.conn)
            rows = []

    # Flush the last, partially filled batch (nothing to do when empty).
    if rows:
        mysql.write(base_sql + ",".join(rows) + end_sql,
                    self.cursor, self.conn)
def fetch_json(self, offset=0):
    '''Fetch one page of records from ``self.url`` and store it in ``dataset``.

    Args:
        offset: value sent as the ``$offset`` query parameter.  ``$limit``
            comes from ``self.limit`` and ``$$app_token`` from
            ``self.socrata_key``.

    Returns:
        1 when the decoded response contained records (they are written
        with one multi-row INSERT through ``db.write``), 0 when it was
        empty.
    '''
    payload = {
        '$limit': self.limit,
        '$offset': offset,
        '$$app_token': self.socrata_key
    }
    # The same request is retried until it succeeds.  Looping instead of
    # recursing keeps a long run of failures from exhausting the stack.
    while True:
        try:
            r = requests.get(self.url, params=payload)
            break
        except requests.exceptions.ChunkedEncodingError:
            print(payload)

    to_save = [
        'latitude', 'longitude', 'id', 'primary_type', 'date',
        'x_coordinate', 'y_coordinate', 'district', 'ward',
        'community_area', 'fbi_code'
    ]
    print(r.url)

    records = r.json()  # decode the body once and reuse it
    if not records:
        return 0

    def sql_literal(value):
        # Escape backslashes and single quotes so that field content cannot
        # terminate the quoted SQL literal early.
        return "'" + value.replace("\\", "\\\\").replace("'", "\\'") + "'"

    rows = []
    for record in records:
        values = []
        for column in to_save:
            if column not in record:
                # Missing fields are stored as empty strings.
                values.append("''")
            elif column == 'date':
                values.append(
                    sql_literal(str(date_to_timestamp(record[column]))))
            else:
                values.append(sql_literal(record[column]))
        rows.append("( " + ", ".join(values) + ")")

    sql = ("INSERT INTO dataset (" + " , ".join(to_save) + " ) VALUES "
           + ", ".join(rows))
    db.write(sql, self.cursor, self.conn)
    return 1
def fetch_json(self, offset=0):
    '''Fetch one page of records from ``self.url`` and store it in ``dataset``.

    Args:
        offset: value sent as the ``$offset`` query parameter.  ``$limit``
            comes from ``self.limit`` and ``$$app_token`` from
            ``self.socrata_key``.

    Returns:
        1 when the decoded response contained records (they are written
        with one multi-row INSERT through ``db.write``), 0 when it was
        empty.
    '''
    payload = {
        '$limit': self.limit,
        '$offset': offset,
        '$$app_token': self.socrata_key
    }
    # The same request is retried until it succeeds.  Looping instead of
    # recursing keeps a long run of failures from exhausting the stack.
    while True:
        try:
            r = requests.get(self.url, params=payload)
            break
        except requests.exceptions.ChunkedEncodingError:
            print(payload)

    to_save = [
        'latitude', 'longitude', 'id', 'primary_type', 'date',
        'x_coordinate', 'y_coordinate', 'district', 'ward',
        'community_area', 'fbi_code'
    ]
    print(r.url)

    records = r.json()  # decode the body once and reuse it
    if not records:
        return 0

    def sql_literal(value):
        # Escape backslashes and single quotes so that field content cannot
        # terminate the quoted SQL literal early.
        return "'" + value.replace("\\", "\\\\").replace("'", "\\'") + "'"

    rows = []
    for record in records:
        values = []
        for column in to_save:
            if column not in record:
                # Missing fields are stored as empty strings.
                values.append("''")
            elif column == 'date':
                values.append(
                    sql_literal(str(date_to_timestamp(record[column]))))
            else:
                values.append(sql_literal(record[column]))
        rows.append("( " + ", ".join(values) + ")")

    sql = ("INSERT INTO dataset (" + " , ".join(to_save) + " ) VALUES "
           + ", ".join(rows))
    db.write(sql, self.cursor, self.conn)
    return 1
def write_category(self):
    '''Insert the category URLs carried by ``self.message`` into ``category``.

    ``self.message['data']`` must be a JSON document with a ``categories``
    list of URL strings.  A payload equal to ``1`` is skipped without
    touching the database (presumably the subscribe acknowledgement of a
    pub/sub channel -- TODO confirm).

    Rows are sent through ``mysql.write`` in batches of at most 100, each
    statement ending in ``ON DUPLICATE KEY UPDATE url=url``.  The row id is
    the SHA-1 hex digest of the stripped, quote-escaped URL.

    Returns:
        None.
    '''
    base_sql = "INSERT INTO category (id,url) VALUES "
    end_sql = " ON DUPLICATE KEY UPDATE url=url"
    batch_size = 100

    if self.message['data'] == 1:
        return

    def quote_escape(text):
        # Backslash-escape both quote characters and trim outer whitespace.
        return str(text.replace("\"", "\\\"").replace("'", "\\'").strip())

    def sql_literal(text):
        # Double the backslashes first: otherwise a value containing (or
        # ending in) "\" could swallow the closing quote of the literal.
        return quote_escape(text.replace("\\", "\\\\"))

    data = json.loads(self.message['data'])
    categories = data['categories']

    rows = []
    for category in categories:
        # The id is derived from the quotes-only form so that it matches
        # the ids of rows that are already stored.
        key = hashlib.sha1(quote_escape(category)).hexdigest()
        rows.append("(\"" + key + "\",\"" + sql_literal(category) + "\")")
        if len(rows) == batch_size:
            mysql.write(base_sql + ",".join(rows) + end_sql,
                        self.cursor, self.conn)
            rows = []

    # Flush the last, partially filled batch (nothing to do when empty).
    if rows:
        mysql.write(base_sql + ",".join(rows) + end_sql,
                    self.cursor, self.conn)
def fetch_followers(self, user, depth=2):
    '''Crawl the follower graph level by level, starting at ``user``.

    Args:
        user: login of the account the crawl starts from.
        depth: levels ``0..depth`` are expanded; a negative value performs
            no requests at all.

    Populates:
        self.followers: login -> list of follower logins.
        self.follower_count: login -> number of followers found.
        self.followers_list: one list of logins per level, beginning with
            ``[user]``.
        self.edge: ``(followed, follower)`` tuples.

    When ``self.options['log'] == 1`` the ``followers`` table is brought in
    line with the crawl through ``db.write``: current rows are flagged
    ``is_deleted = 2``, every crawled pair is upserted with
    ``is_deleted = 0`` and rows still flagged afterwards are deleted.

    Returns:
        None.
    '''
    self.followers = defaultdict(list)
    self.follower_count = defaultdict(int)
    self.followers_list = [[user]]
    self.edge = []

    level = 0
    while level <= depth:
        next_level = []
        for account in self.followers_list[level]:
            if account in self.followers:
                # Already expanded: reuse the stored list, add no new edges.
                next_level.extend(self.followers[account])
                continue
            print(account)
            # https://api.github.com/users/shagunsodhani/followers?page=1&per_page=100
            url = self.root_url + "/users/" + account + "/followers"
            self.params['page'] = 1
            page = requests.get(url, params=self.params).json()
            found = []
            # An empty page marks the end of the listing.
            while page:
                for entry in page:
                    login = str(entry['login'])
                    found.append(login)
                    next_level.append(login)
                    self.edge.append((account, login))
                self.params['page'] += 1
                page = requests.get(url, params=self.params).json()
            self.followers[account] = found
            self.follower_count[account] = len(found)
        self.followers_list.append(next_level)
        level += 1

    if self.options['log'] == 1:
        db.write("UPDATE followers SET is_deleted = 2 WHERE is_deleted = 0",
                 self.cursor, self.conn)
        sql_base = "INSERT INTO followers (user1, user2, is_deleted) VALUES "
        sql_end = " ON DUPLICATE KEY UPDATE is_deleted=0"
        batch_size = 10000
        rows = []
        inserted = 0
        for followed in self.followers:
            for follower in self.followers[followed]:
                rows.append("('" + followed + "', '" + follower + "', 0)")
                if len(rows) == batch_size:
                    db.write(sql_base + ", ".join(rows) + sql_end,
                             self.cursor, self.conn)
                    inserted += len(rows)
                    print("%d  insertions completed." % inserted)
                    rows = []
        # Only flush when rows are pending; an INSERT without any value
        # tuples would be invalid SQL.
        if rows:
            db.write(sql_base + ", ".join(rows) + sql_end,
                     self.cursor, self.conn)
            inserted += len(rows)
        print("%d  insertions completed." % inserted)
        db.write("DELETE FROM followers WHERE is_deleted != 0",
                 self.cursor, self.conn)
def fetch_repo_fork(self, user, depth=2):
    '''Crawl the "who forked whose repositories" graph starting at ``user``.

    Args:
        user: login of the account the crawl starts from.
        depth: levels ``0..depth`` are expanded; a negative value performs
            no requests at all.

    For every account on a level, each of its repositories is listed and,
    for each repository, every page of its forks is read.

    Populates:
        self.fork: login -> list of logins owning a fork of one of its
            repositories (one entry per fork).
        self.fork_count: login -> length of that list.
        self.fork_list: one list of logins per level, beginning with
            ``[user]``.
        self.edge: ``(owner, forker)`` tuples.

    When ``self.options['log'] == 1`` the ``fork`` table is brought in line
    with the crawl through ``db.write``: current rows are flagged
    ``is_deleted = 2``, every crawled pair is upserted with
    ``is_deleted = 0`` and rows still flagged afterwards are deleted.

    Returns:
        None.
    '''
    # https://api.github.com/users/shagunsodhani/repos
    self.fork = defaultdict(list)
    self.fork_count = defaultdict(int)
    self.fork_list = [[user]]
    self.edge = []
    fork_params = {}

    def paginate(url, query):
        # Yield each non-empty page of ``url``; ``query['page']`` is advanced
        # in place.  Every listing keeps its own URL, so the repository
        # listing cannot be continued from a forks URL by accident.
        query['page'] = 1
        batch = requests.get(url, params=query).json()
        while batch:
            yield batch
            query['page'] += 1
            batch = requests.get(url, params=query).json()

    level = 0
    while level <= depth:
        next_level = []
        for account in self.fork_list[level]:
            if account in self.fork:
                # Already expanded: reuse the stored list, add no new edges.
                next_level.extend(self.fork[account])
                continue
            print(account)
            repos_url = self.root_url + "/users/" + account + "/repos"
            forkers = []
            for repos in paginate(repos_url, self.params):
                for repo in repos:
                    print(repo['name'])
                    forks_url = (self.root_url + "/repos/" + account + "/"
                                 + str(repo['name']) + "/forks")
                    fork_params['access_token'] = self.token
                    # Walk every page of forks, not just the first one.
                    for forks in paginate(forks_url, fork_params):
                        for fork in forks:
                            # The forker is the owner of the fork entry,
                            # not of the repository that was forked.
                            login = str(fork['owner']['login'])
                            forkers.append(login)
                            next_level.append(login)
                            self.edge.append((account, login))
            self.fork[account] = forkers
            self.fork_count[account] = len(forkers)
        self.fork_list.append(next_level)
        level += 1

    if self.options['log'] == 1:
        db.write("UPDATE fork SET is_deleted = 2 WHERE is_deleted = 0",
                 self.cursor, self.conn)
        sql_base = "INSERT INTO fork (user1, user2, is_deleted) VALUES "
        sql_end = " ON DUPLICATE KEY UPDATE is_deleted=0"
        batch_size = 10000
        rows = []
        inserted = 0
        for owner in self.fork:
            for forker in self.fork[owner]:
                rows.append("('" + owner + "', '" + forker + "', 0)")
                if len(rows) == batch_size:
                    db.write(sql_base + ", ".join(rows) + sql_end,
                             self.cursor, self.conn)
                    inserted += len(rows)
                    print("%d  insertions completed." % inserted)
                    rows = []
        # Only flush when rows are pending; an INSERT without any value
        # tuples would be invalid SQL.
        if rows:
            db.write(sql_base + ", ".join(rows) + sql_end,
                     self.cursor, self.conn)
            inserted += len(rows)
        print("%d  insertions completed." % inserted)
        db.write("DELETE FROM fork WHERE is_deleted != 0",
                 self.cursor, self.conn)
def fetch_followers(self, user, depth = 2):
    '''Crawl the follower graph level by level, starting at ``user``.

    Args:
        user: login of the account the crawl starts from.
        depth: levels ``0..depth`` are expanded; a negative value performs
            no requests at all.

    Populates:
        self.followers: login -> list of follower logins.
        self.follower_count: login -> number of followers found.
        self.followers_list: one list of logins per level, beginning with
            ``[user]``.
        self.edge: ``(followed, follower)`` tuples.

    When ``self.options['log'] == 1`` the ``followers`` table is brought in
    line with the crawl through ``db.write``: current rows are flagged
    ``is_deleted = 2``, every crawled pair is upserted with
    ``is_deleted = 0`` and rows still flagged afterwards are deleted.

    Returns:
        None.
    '''
    self.followers = defaultdict(list)
    self.follower_count = defaultdict(int)
    self.followers_list = [[user]]
    self.edge = []

    level = 0
    while level <= depth:
        next_level = []
        for account in self.followers_list[level]:
            if account in self.followers:
                # Already expanded: reuse the stored list, add no new edges.
                next_level.extend(self.followers[account])
                continue
            print(account)
            # https://api.github.com/users/shagunsodhani/followers?page=1&per_page=100
            url = self.root_url + "/users/" + account + "/followers"
            self.params['page'] = 1
            page = requests.get(url, params=self.params).json()
            found = []
            # An empty page marks the end of the listing.
            while page:
                for entry in page:
                    login = str(entry['login'])
                    found.append(login)
                    next_level.append(login)
                    self.edge.append((account, login))
                self.params['page'] += 1
                page = requests.get(url, params=self.params).json()
            self.followers[account] = found
            self.follower_count[account] = len(found)
        self.followers_list.append(next_level)
        level += 1

    if self.options['log'] == 1:
        db.write("UPDATE followers SET is_deleted = 2 WHERE is_deleted = 0",
                 self.cursor, self.conn)
        sql_base = "INSERT INTO followers (user1, user2, is_deleted) VALUES "
        sql_end = " ON DUPLICATE KEY UPDATE is_deleted=0"
        batch_size = 10000
        rows = []
        inserted = 0
        for followed in self.followers:
            for follower in self.followers[followed]:
                rows.append("('" + followed + "', '" + follower + "', 0)")
                if len(rows) == batch_size:
                    db.write(sql_base + ", ".join(rows) + sql_end,
                             self.cursor, self.conn)
                    inserted += len(rows)
                    print("%d  insertions completed." % inserted)
                    rows = []
        # Only flush when rows are pending; an INSERT without any value
        # tuples would be invalid SQL.
        if rows:
            db.write(sql_base + ", ".join(rows) + sql_end,
                     self.cursor, self.conn)
            inserted += len(rows)
        print("%d  insertions completed." % inserted)
        db.write("DELETE FROM followers WHERE is_deleted != 0",
                 self.cursor, self.conn)
def fetch_repo_fork(self, user, depth = 2):
    '''Crawl the "who forked whose repositories" graph starting at ``user``.

    Args:
        user: login of the account the crawl starts from.
        depth: levels ``0..depth`` are expanded; a negative value performs
            no requests at all.

    For every account on a level, each of its repositories is listed and,
    for each repository, every page of its forks is read.

    Populates:
        self.fork: login -> list of logins owning a fork of one of its
            repositories (one entry per fork).
        self.fork_count: login -> length of that list.
        self.fork_list: one list of logins per level, beginning with
            ``[user]``.
        self.edge: ``(owner, forker)`` tuples.

    When ``self.options['log'] == 1`` the ``fork`` table is brought in line
    with the crawl through ``db.write``: current rows are flagged
    ``is_deleted = 2``, every crawled pair is upserted with
    ``is_deleted = 0`` and rows still flagged afterwards are deleted.

    Returns:
        None.
    '''
    # https://api.github.com/users/shagunsodhani/repos
    self.fork = defaultdict(list)
    self.fork_count = defaultdict(int)
    self.fork_list = [[user]]
    self.edge = []
    fork_params = {}

    def paginate(url, query):
        # Yield each non-empty page of ``url``; ``query['page']`` is advanced
        # in place.  Every listing keeps its own URL, so the repository
        # listing cannot be continued from a forks URL by accident.
        query['page'] = 1
        batch = requests.get(url, params=query).json()
        while batch:
            yield batch
            query['page'] += 1
            batch = requests.get(url, params=query).json()

    level = 0
    while level <= depth:
        next_level = []
        for account in self.fork_list[level]:
            if account in self.fork:
                # Already expanded: reuse the stored list, add no new edges.
                next_level.extend(self.fork[account])
                continue
            print(account)
            repos_url = self.root_url + "/users/" + account + "/repos"
            forkers = []
            for repos in paginate(repos_url, self.params):
                for repo in repos:
                    print(repo['name'])
                    forks_url = (self.root_url + "/repos/" + account + "/"
                                 + str(repo['name']) + "/forks")
                    fork_params['access_token'] = self.token
                    # Walk every page of forks, not just the first one.
                    for forks in paginate(forks_url, fork_params):
                        for fork in forks:
                            # The forker is the owner of the fork entry,
                            # not of the repository that was forked.
                            login = str(fork['owner']['login'])
                            forkers.append(login)
                            next_level.append(login)
                            self.edge.append((account, login))
            self.fork[account] = forkers
            self.fork_count[account] = len(forkers)
        self.fork_list.append(next_level)
        level += 1

    if self.options['log'] == 1:
        db.write("UPDATE fork SET is_deleted = 2 WHERE is_deleted = 0",
                 self.cursor, self.conn)
        sql_base = "INSERT INTO fork (user1, user2, is_deleted) VALUES "
        sql_end = " ON DUPLICATE KEY UPDATE is_deleted=0"
        batch_size = 10000
        rows = []
        inserted = 0
        for owner in self.fork:
            for forker in self.fork[owner]:
                rows.append("('" + owner + "', '" + forker + "', 0)")
                if len(rows) == batch_size:
                    db.write(sql_base + ", ".join(rows) + sql_end,
                             self.cursor, self.conn)
                    inserted += len(rows)
                    print("%d  insertions completed." % inserted)
                    rows = []
        # Only flush when rows are pending; an INSERT without any value
        # tuples would be invalid SQL.
        if rows:
            db.write(sql_base + ", ".join(rows) + sql_end,
                     self.cursor, self.conn)
            inserted += len(rows)
        print("%d  insertions completed." % inserted)
        db.write("DELETE FROM fork WHERE is_deleted != 0",
                 self.cursor, self.conn)