def write_article(self):
    """Insert the article URLs carried by ``self.message`` into ``article``.

    ``self.message['data']`` is parsed as JSON and must provide an
    ``articles`` list of URL strings and a ``newssource`` string.  Rows are
    written through ``mysql.write`` in batches of 100.  Each row's ``id`` is
    the SHA-1 hex digest of the quote-escaped URL, ``crawled_at`` is the
    current Unix time, and an already existing id is left untouched
    (``ON DUPLICATE KEY UPDATE url=url``).

    Returns ``None``.  Nothing is written when the payload equals the
    integer ``1`` (presumably a subscription acknowledgement rather than a
    real payload -- confirm against the publisher) or when ``articles`` is
    empty.
    """
    payload = self.message['data']
    if payload == 1:
        return

    base_sql = "INSERT INTO article (id,url,newssource,crawled_at) VALUES "
    end_sql = " ON DUPLICATE KEY UPDATE url=url"
    batch_size = 100

    def escape_quotes(text):
        # Quote-only escaping.  Row ids are hashed from this form, so it must
        # stay exactly as it is or existing rows would stop de-duplicating.
        return text.replace("\"", "\\\"").replace("\'", "\\\'").strip()

    def sql_literal(text):
        # Backslashes are doubled *before* the quotes are escaped; otherwise a
        # backslash in the crawled value (e.g. a trailing one) would swallow
        # the closing quote of the literal and corrupt the whole statement.
        return str(escape_quotes(text.replace("\\", "\\\\")))

    data = json.loads(payload)
    articles = data['articles']
    newssource = sql_literal(data['newssource'])

    rows = []
    for article in articles:
        key = str(hashlib.sha1(str(escape_quotes(article))).hexdigest())
        crawled_at = str(int(time.time()))
        rows.append(
            "(\"" + key + "\",\"" + sql_literal(article) + "\",\""
            + newssource + "\"," + crawled_at + ")"
        )
        if len(rows) == batch_size:
            mysql.write(base_sql + ",".join(rows) + end_sql,
                        self.cursor, self.conn)
            rows = []

    # Flush the final partial batch; skipped when nothing is pending so an
    # INSERT without a VALUES list is never sent.
    if rows:
        mysql.write(base_sql + ",".join(rows) + end_sql,
                    self.cursor, self.conn)
Example #2
0
    def fetch_json(self, offset=0):
        """Fetch one page from ``self.url`` and bulk-insert it into ``dataset``.

        The request asks for ``self.limit`` records starting at ``offset``
        and authenticates with ``self.socrata_key``.  Only the columns named
        in ``to_save`` are stored; a column missing from a record is stored
        as an empty string, and ``date`` is passed through
        ``date_to_timestamp`` first.

        Returns ``1`` when the page contained records (which are then written
        with ``db.write``) and ``0`` when the response JSON was empty.
        """
        payload = {
            '$limit': self.limit,
            '$offset': offset,
            '$$app_token': self.socrata_key,
        }

        try:
            r = requests.get(self.url, params=payload)
        except requests.exceptions.ChunkedEncodingError:
            # Log the failing request and retry the same page.
            # NOTE(review): the retry is recursive and unbounded.
            print(payload)
            return self.fetch_json(offset=offset)

        to_save = [
            'latitude', 'longitude', 'id', 'primary_type', 'date',
            'x_coordinate', 'y_coordinate', 'district', 'ward',
            'community_area', 'fbi_code'
        ]
        print(r.url)

        # Parse the body once instead of once for the test and once for the loop.
        records = r.json()
        if not records:
            return 0

        def quote(value):
            # The value ends up inside the statement text, so backslashes and
            # single quotes are escaped.  "%s" formatting also accepts
            # non-string JSON values, which plain concatenation would reject.
            text = "%s" % (value,)
            return "'" + text.replace("\\", "\\\\").replace("'", "\\'") + "'"

        rows = []
        for record in records:
            cells = []
            for column in to_save:
                if column not in record:
                    cells.append("''")
                elif column == 'date':
                    cells.append(quote(date_to_timestamp(record[column])))
                else:
                    cells.append(quote(record[column]))
            rows.append("( " + ", ".join(cells) + ")")

        sql = ("INSERT INTO dataset (" + " , ".join(to_save) + " ) VALUES "
               + ", ".join(rows))
        db.write(sql, self.cursor, self.conn)
        return 1
Example #3
0
    def fetch_json(self, offset=0):
        """Fetch one page from ``self.url`` and bulk-insert it into ``dataset``.

        The request asks for ``self.limit`` records starting at ``offset``
        and authenticates with ``self.socrata_key``.  Only the columns named
        in ``to_save`` are stored; a column missing from a record is stored
        as an empty string, and ``date`` is passed through
        ``date_to_timestamp`` first.

        Returns ``1`` when the page contained records (which are then written
        with ``db.write``) and ``0`` when the response JSON was empty.
        """
        payload = {
            '$limit': self.limit,
            '$offset': offset,
            '$$app_token': self.socrata_key,
        }

        try:
            r = requests.get(self.url, params=payload)
        except requests.exceptions.ChunkedEncodingError:
            # Log the failing request and retry the same page.
            # NOTE(review): the retry is recursive and unbounded.
            print(payload)
            return self.fetch_json(offset=offset)

        to_save = [
            'latitude', 'longitude', 'id', 'primary_type', 'date',
            'x_coordinate', 'y_coordinate', 'district', 'ward',
            'community_area', 'fbi_code'
        ]
        print(r.url)

        # Parse the body once instead of once for the test and once for the loop.
        records = r.json()
        if not records:
            return 0

        def quote(value):
            # The value ends up inside the statement text, so backslashes and
            # single quotes are escaped.  "%s" formatting also accepts
            # non-string JSON values, which plain concatenation would reject.
            text = "%s" % (value,)
            return "'" + text.replace("\\", "\\\\").replace("'", "\\'") + "'"

        rows = []
        for record in records:
            cells = []
            for column in to_save:
                if column not in record:
                    cells.append("''")
                elif column == 'date':
                    cells.append(quote(date_to_timestamp(record[column])))
                else:
                    cells.append(quote(record[column]))
            rows.append("( " + ", ".join(cells) + ")")

        sql = ("INSERT INTO dataset (" + " , ".join(to_save) + " ) VALUES "
               + ", ".join(rows))
        db.write(sql, self.cursor, self.conn)
        return 1
 def write_category(self):
     """Insert the category URLs carried by ``self.message`` into ``category``.

     ``self.message['data']`` is parsed as JSON and must provide a
     ``categories`` list of URL strings.  Rows are written through
     ``mysql.write`` in batches of 100.  Each row's ``id`` is the SHA-1 hex
     digest of the quote-escaped URL, and an already existing id is left
     untouched (``ON DUPLICATE KEY UPDATE url=url``).

     Returns ``None``.  Nothing is written when the payload equals the
     integer ``1`` (presumably a subscription acknowledgement rather than a
     real payload -- confirm against the publisher) or when ``categories``
     is empty.
     """
     payload = self.message['data']
     if payload == 1:
         return

     base_sql = "INSERT INTO category (id,url) VALUES "
     end_sql = " ON DUPLICATE KEY UPDATE url=url"
     batch_size = 100

     def escape_quotes(text):
         # Quote-only escaping.  Row ids are hashed from this form, so it must
         # stay exactly as it is or existing rows would stop de-duplicating.
         return text.replace("\"", "\\\"").replace("\'", "\\\'").strip()

     def sql_literal(text):
         # Backslashes are doubled *before* the quotes are escaped; otherwise
         # a backslash in the crawled value (e.g. a trailing one) would
         # swallow the closing quote of the literal and corrupt the statement.
         return str(escape_quotes(text.replace("\\", "\\\\")))

     data = json.loads(payload)
     categories = data['categories']

     rows = []
     for category in categories:
         key = str(hashlib.sha1(str(escape_quotes(category))).hexdigest())
         rows.append("(\"" + key + "\",\"" + sql_literal(category) + "\")")
         if len(rows) == batch_size:
             mysql.write(base_sql + ",".join(rows) + end_sql,
                         self.cursor, self.conn)
             rows = []

     # Flush the final partial batch; skipped when nothing is pending so an
     # INSERT without a VALUES list is never sent.
     if rows:
         mysql.write(base_sql + ",".join(rows) + end_sql,
                     self.cursor, self.conn)
Example #5
0
    def fetch_followers(self, user, depth=2):
        """Crawl follower lists level by level, starting from ``user``.

        Level 0 is ``[user]``; each further level holds the followers of the
        accounts on the previous one.  Levels ``0 .. depth`` are expanded.
        Each account is requested only once, from
        ``self.root_url + "/users/<login>/followers"``, page by page until an
        empty page is returned (``self.params['page']`` is overwritten while
        doing so).

        Results are stored on the instance:
          * ``self.followers``      -- login -> list of follower logins
          * ``self.follower_count`` -- login -> number of followers found
          * ``self.followers_list`` -- one list of logins per level
          * ``self.edge``           -- ``(login, follower_login)`` tuples

        When ``self.options['log'] == 1`` the ``followers`` table is
        refreshed through ``db.write``.  Returns ``None``.
        """
        self.followers = defaultdict(list)
        self.follower_count = defaultdict(int)
        self.followers_list = []
        self.edge = []
        self.followers_list.append([user])

        level = 0
        while level <= depth:
            next_level = []
            for account in self.followers_list[level]:
                if account not in self.followers:
                    print(account)
                    # https://api.github.com/users/shagunsodhani/followers?page=1&per_page=100
                    url = self.root_url + "/users/" + account + "/followers"
                    self.params['page'] = 1
                    page = requests.get(url, params=self.params).json()
                    found = []
                    while page:
                        for entry in page:
                            login = str(entry['login'])
                            found.append(login)
                            next_level.append(login)
                            self.edge.append((account, login))
                        self.params['page'] += 1
                        page = requests.get(url, params=self.params).json()
                    self.followers[account] = found
                    self.follower_count[account] = len(found)
                else:
                    # Already crawled: reuse the stored followers.
                    next_level.extend(self.followers[account])
            self.followers_list.append(next_level)
            level += 1

        if self.options['log'] == 1:
            # Mark every live row as stale (2).  Rows re-inserted below are
            # reset to 0; whatever is still non-zero afterwards is deleted.
            db.write("UPDATE followers SET is_deleted = 2 WHERE is_deleted = 0",
                     self.cursor, self.conn)

            sql_base = "INSERT INTO followers (user1, user2, is_deleted) VALUES "
            sql_end = " ON DUPLICATE KEY UPDATE is_deleted=0"
            batch_size = 10000
            pending = []
            written = 0
            for account, logins in self.followers.items():
                for login in logins:
                    pending.append("('" + account + "', '" + login + "', 0)")
                    if len(pending) == batch_size:
                        db.write(sql_base + ", ".join(pending) + sql_end,
                                 self.cursor, self.conn)
                        written += len(pending)
                        print("%s %s" % (written, " insertions completed."))
                        pending = []
            # Only flush when rows are pending: an INSERT with an empty
            # VALUES list is invalid SQL.
            if pending:
                db.write(sql_base + ", ".join(pending) + sql_end,
                         self.cursor, self.conn)
                written += len(pending)
                print("%s %s" % (written, " insertions completed."))

            db.write("DELETE FROM followers WHERE is_deleted != 0",
                     self.cursor, self.conn)
Example #6
0
    def fetch_repo_fork(self, user, depth=2):
        """Crawl "who forked whose repositories" level by level from ``user``.

        Level 0 is ``[user]``; each further level holds the owners of the
        forks of the repositories owned by the accounts on the previous
        level.  Levels ``0 .. depth`` are expanded and each account is
        processed only once.  Repositories are listed from
        ``self.root_url + "/users/<login>/repos"`` using ``self.params``
        (whose ``'page'`` entry is overwritten); forks are listed from
        ``.../repos/<login>/<repo>/forks`` using ``self.token``.  Both
        listings are read page by page until an empty page is returned.

        Results are stored on the instance:
          * ``self.fork``       -- login -> list of fork-owner logins
          * ``self.fork_count`` -- login -> number of forks found
          * ``self.fork_list``  -- one list of logins per level
          * ``self.edge``       -- ``(login, fork_owner_login)`` tuples

        When ``self.options['log'] == 1`` the ``fork`` table is refreshed
        through ``db.write``.  Returns ``None``.
        """
        # https://api.github.com/users/shagunsodhani/repos
        self.fork = defaultdict(list)
        self.fork_count = defaultdict(int)
        self.fork_list = []
        self.edge = []
        self.fork_list.append([user])
        fork_params = {}

        level = 0
        while level <= depth:
            next_level = []
            for account in self.fork_list[level]:
                if account not in self.fork:
                    print(account)
                    # Kept in its own variable so that paging through the
                    # repositories is not redirected to a forks URL.
                    repos_url = self.root_url + "/users/" + account + "/repos"
                    self.params['page'] = 1
                    repos = requests.get(repos_url, params=self.params).json()
                    found = []
                    while repos:
                        for repo in repos:
                            print(repo['name'])
                            forks_url = (self.root_url + "/repos/" + account
                                         + "/" + str(repo['name']) + "/forks")
                            fork_params['access_token'] = self.token
                            fork_params['page'] = 1
                            forks = requests.get(
                                forks_url, params=fork_params).json()
                            # Advance one page per request, after the whole
                            # page has been consumed.
                            while forks:
                                for fork in forks:
                                    # The owner of the fork, not of the
                                    # repository that was forked.
                                    login = str(fork['owner']['login'])
                                    found.append(login)
                                    next_level.append(login)
                                    self.edge.append((account, login))
                                fork_params['page'] += 1
                                forks = requests.get(
                                    forks_url, params=fork_params).json()
                        self.params['page'] += 1
                        repos = requests.get(
                            repos_url, params=self.params).json()
                    self.fork[account] = found
                    self.fork_count[account] = len(found)
                else:
                    # Already crawled: reuse the stored fork owners.
                    next_level.extend(self.fork[account])
            self.fork_list.append(next_level)
            level += 1

        if self.options['log'] == 1:
            # Mark every live row as stale (2).  Rows re-inserted below are
            # reset to 0; whatever is still non-zero afterwards is deleted.
            db.write("UPDATE fork SET is_deleted = 2 WHERE is_deleted = 0",
                     self.cursor, self.conn)

            sql_base = "INSERT INTO fork (user1, user2, is_deleted) VALUES "
            sql_end = " ON DUPLICATE KEY UPDATE is_deleted=0"
            batch_size = 10000
            pending = []
            written = 0
            for account, logins in self.fork.items():
                for login in logins:
                    pending.append("('" + account + "', '" + login + "', 0)")
                    if len(pending) == batch_size:
                        db.write(sql_base + ", ".join(pending) + sql_end,
                                 self.cursor, self.conn)
                        written += len(pending)
                        print("%s %s" % (written, " insertions completed."))
                        pending = []
            # Only flush when rows are pending: an INSERT with an empty
            # VALUES list is invalid SQL.
            if pending:
                db.write(sql_base + ", ".join(pending) + sql_end,
                         self.cursor, self.conn)
                written += len(pending)
                print("%s %s" % (written, " insertions completed."))

            db.write("DELETE FROM fork WHERE is_deleted != 0",
                     self.cursor, self.conn)
Example #7
0
    def fetch_followers(self, user, depth = 2):
        """Crawl follower lists level by level, starting from ``user``.

        Level 0 is ``[user]``; each further level holds the followers of the
        accounts on the previous one.  Levels ``0 .. depth`` are expanded.
        Each account is requested only once, from
        ``self.root_url + "/users/<login>/followers"``, page by page until an
        empty page is returned (``self.params['page']`` is overwritten while
        doing so).

        Results are stored on the instance:
          * ``self.followers``      -- login -> list of follower logins
          * ``self.follower_count`` -- login -> number of followers found
          * ``self.followers_list`` -- one list of logins per level
          * ``self.edge``           -- ``(login, follower_login)`` tuples

        When ``self.options['log'] == 1`` the ``followers`` table is
        refreshed through ``db.write``.  Returns ``None``.
        """
        self.followers = defaultdict(list)
        self.follower_count = defaultdict(int)
        self.followers_list = []
        self.edge = []
        self.followers_list.append([user])

        level = 0
        while level <= depth:
            next_level = []
            for account in self.followers_list[level]:
                if account not in self.followers:
                    print(account)
                    # https://api.github.com/users/shagunsodhani/followers?page=1&per_page=100
                    url = self.root_url + "/users/" + account + "/followers"
                    self.params['page'] = 1
                    page = requests.get(url, params=self.params).json()
                    found = []
                    while page:
                        for entry in page:
                            login = str(entry['login'])
                            found.append(login)
                            next_level.append(login)
                            self.edge.append((account, login))
                        self.params['page'] += 1
                        page = requests.get(url, params=self.params).json()
                    self.followers[account] = found
                    self.follower_count[account] = len(found)
                else:
                    # Already crawled: reuse the stored followers.
                    next_level.extend(self.followers[account])
            self.followers_list.append(next_level)
            level += 1

        if self.options['log'] == 1:
            # Mark every live row as stale (2).  Rows re-inserted below are
            # reset to 0; whatever is still non-zero afterwards is deleted.
            db.write("UPDATE followers SET is_deleted = 2 WHERE is_deleted = 0",
                     self.cursor, self.conn)

            sql_base = "INSERT INTO followers (user1, user2, is_deleted) VALUES "
            sql_end = " ON DUPLICATE KEY UPDATE is_deleted=0"
            batch_size = 10000
            pending = []
            written = 0
            for account, logins in self.followers.items():
                for login in logins:
                    pending.append("('" + account + "', '" + login + "', 0)")
                    if len(pending) == batch_size:
                        db.write(sql_base + ", ".join(pending) + sql_end,
                                 self.cursor, self.conn)
                        written += len(pending)
                        print("%s %s" % (written, " insertions completed."))
                        pending = []
            # Only flush when rows are pending: an INSERT with an empty
            # VALUES list is invalid SQL.
            if pending:
                db.write(sql_base + ", ".join(pending) + sql_end,
                         self.cursor, self.conn)
                written += len(pending)
                print("%s %s" % (written, " insertions completed."))

            db.write("DELETE FROM followers WHERE is_deleted != 0",
                     self.cursor, self.conn)
Example #8
0
    def fetch_repo_fork(self, user, depth = 2):
        """Crawl "who forked whose repositories" level by level from ``user``.

        Level 0 is ``[user]``; each further level holds the owners of the
        forks of the repositories owned by the accounts on the previous
        level.  Levels ``0 .. depth`` are expanded and each account is
        processed only once.  Repositories are listed from
        ``self.root_url + "/users/<login>/repos"`` using ``self.params``
        (whose ``'page'`` entry is overwritten); forks are listed from
        ``.../repos/<login>/<repo>/forks`` using ``self.token``.  Both
        listings are read page by page until an empty page is returned.

        Results are stored on the instance:
          * ``self.fork``       -- login -> list of fork-owner logins
          * ``self.fork_count`` -- login -> number of forks found
          * ``self.fork_list``  -- one list of logins per level
          * ``self.edge``       -- ``(login, fork_owner_login)`` tuples

        When ``self.options['log'] == 1`` the ``fork`` table is refreshed
        through ``db.write``.  Returns ``None``.
        """
        # https://api.github.com/users/shagunsodhani/repos
        self.fork = defaultdict(list)
        self.fork_count = defaultdict(int)
        self.fork_list = []
        self.edge = []
        self.fork_list.append([user])
        fork_params = {}

        level = 0
        while level <= depth:
            next_level = []
            for account in self.fork_list[level]:
                if account not in self.fork:
                    print(account)
                    # Kept in its own variable so that paging through the
                    # repositories is not redirected to a forks URL.
                    repos_url = self.root_url + "/users/" + account + "/repos"
                    self.params['page'] = 1
                    repos = requests.get(repos_url, params=self.params).json()
                    found = []
                    while repos:
                        for repo in repos:
                            print(repo['name'])
                            forks_url = (self.root_url + "/repos/" + account
                                         + "/" + str(repo['name']) + "/forks")
                            fork_params['access_token'] = self.token
                            fork_params['page'] = 1
                            forks = requests.get(
                                forks_url, params=fork_params).json()
                            # Advance one page per request, after the whole
                            # page has been consumed.
                            while forks:
                                for fork in forks:
                                    # The owner of the fork, not of the
                                    # repository that was forked.
                                    login = str(fork['owner']['login'])
                                    found.append(login)
                                    next_level.append(login)
                                    self.edge.append((account, login))
                                fork_params['page'] += 1
                                forks = requests.get(
                                    forks_url, params=fork_params).json()
                        self.params['page'] += 1
                        repos = requests.get(
                            repos_url, params=self.params).json()
                    self.fork[account] = found
                    self.fork_count[account] = len(found)
                else:
                    # Already crawled: reuse the stored fork owners.
                    next_level.extend(self.fork[account])
            self.fork_list.append(next_level)
            level += 1

        if self.options['log'] == 1:
            # Mark every live row as stale (2).  Rows re-inserted below are
            # reset to 0; whatever is still non-zero afterwards is deleted.
            db.write("UPDATE fork SET is_deleted = 2 WHERE is_deleted = 0",
                     self.cursor, self.conn)

            sql_base = "INSERT INTO fork (user1, user2, is_deleted) VALUES "
            sql_end = " ON DUPLICATE KEY UPDATE is_deleted=0"
            batch_size = 10000
            pending = []
            written = 0
            for account, logins in self.fork.items():
                for login in logins:
                    pending.append("('" + account + "', '" + login + "', 0)")
                    if len(pending) == batch_size:
                        db.write(sql_base + ", ".join(pending) + sql_end,
                                 self.cursor, self.conn)
                        written += len(pending)
                        print("%s %s" % (written, " insertions completed."))
                        pending = []
            # Only flush when rows are pending: an INSERT with an empty
            # VALUES list is invalid SQL.
            if pending:
                db.write(sql_base + ", ".join(pending) + sql_end,
                         self.cursor, self.conn)
                written += len(pending)
                print("%s %s" % (written, " insertions completed."))

            db.write("DELETE FROM fork WHERE is_deleted != 0",
                     self.cursor, self.conn)