def get_non_retrieved(info_label): """ #@Argument: """ paths = file_loader.loadList(_output_path+"index.txt") current_user_index = pu.get_last_status(paths, log_path+"status.txt") i = 0 non_retrieved_list = [] for path in paths: label_found = False aux = 0 for line in open(path.lstrip("/").rstrip("\n")): #First line right after the label, therefore there is data (at least one!) if label_found == True: aux += 1 break if info_label in line: label_found = True if aux == 0: non_retrieved_list.append(path.rstrip("\n")) if i == current_user_index: break i = i + 1 print i, len(non_retrieved_list) file_loader.listToTXT(non_retrieved_list, _output_path+"non_retrieved.txt")
class CSVUtil(object): def __init__(self): self._fLoader = PyFileLoader() #@Arguments: #path = path string for a csv file #keyIndex = an interger with the column index to become the dictionary key, set as 0 by default, i.e.: first column of csv file #Return: A dictionary whose keys were set by keyIndex and the value is a list of dictionaries for the specified key def loadCSV(self, path, keyIndex=0): path = path.lstrip("/").rstrip("\n") collection = {} try: data = self._fLoader.loadPath(path) n = 0 header = "" for row in data: info = row.rstrip("\n").split(";") #Getting value as the key for the dictionary key = info.pop(keyIndex) #Reading the header if n == 0: header = info n = n + 1 #For the rest of the file else: single = {} i = 0 for col in header: single[col] = info[i] i = i+1 self._fLoader.pushDict(collection, key, single) except Exception, e: print e return collection
def get_all_lists(query, keyIndex = 3): """ #@Arguments #query = string to decide which kind of list must be retrived, i.e.: followers, friends, and etc.. #keyIndex = An integefer for screen_name, which we assume by default that is at position 3 #Return: None, it gets all values of given query and writes it on file #It is expected to have an index file with all users file_path """ paths = file_loader.loadList(_output_path+'index.txt') if paths == False: log.warning("No paths file to load") print "No paths file to load" sys.exit() current_user_index = pu.get_last_status(paths, log_path+"status.txt") print 'Starting at: ', paths[current_user_index].rstrip("\n") percentage = get_percentage(_output_path+'all-users.csv', query+"_count", current_user_index) last_user_index = len(paths) while current_user_index < last_user_index: #Loading from the account txt file which will create a dictionary at this point. user = pu.sort_dict(file_loader.loadCSV(paths[current_user_index], keyIndex)) screen_name = user.keys()[0] query_list = get_list(screen_name, query) percentage["collected_"+query+"_count"] += len(query_list) store_listed_info(query, query_list, screen_name, paths[current_user_index]) print "Status:", round(100*float(percentage["collected_"+query+"_count"])/percentage["total_"+query+"_count"], 2), "completed" current_user_index += 1
def get_list(screen_name, query): """ #@Arguments: #screen_name = a string with the user screen name #query = string to decide which kind of list must be retrived, i.e.: followers, friends, and etc.. #Return: list of ids for the given query or an empty list in case it is not possible, i.e. a protected account """ print 'Collect list of:'+screen_name+"'s "+query #Marking the current user current_user = file_loader.loadPath(log_path+"status.txt", 'w') current_user.write(screen_name) current_user.close() _list = [] while CURSOR != "0": url = "https://api.twitter.com/1.1/"+query+"/ids.json?cursor=" url += CURSOR+"&screen_name="+screen_name+"&count=5000" info = CREDENTIALS[CREDENTIAL_INDEX].request(url) data = check_limit(info, screen_name, 'list') if data is not False: if "ids" not in data: print "Something went wrong, data = ", data _list = _list + data["ids"] else: print 'Some problem with data, URL = ', url CURSOR = "-1" return _list
def store_listed_info(query, ids_list, screen_name, path): """ #@Arguments: #ids_list = a list of ids or an empty list #screen_name = a string with the user screen name #user_id = an integer with the user id #path = path for the file which contains user information and will hold all the followers #Return: None. Method saves information or stops program execution """ if ids_list is not []: file_loader.listToTXT(ids_list,path,'a+', '', '#'+query.capitalize()+"\n") else: print 'Unexpected error' log.warning("Unexpected error! Couldn't retrieve followers of "+screen_name) sys.exit() print len(ids_list), query.capitalize()+' for ', screen_name log.note("Information successfully retrieved: "+screen_name)
def get_all_accounts(csv_path, keyIndex): """ #@Arguments: #csv_path = Path string for the .csv file which contains all accounts information. #keyIndex = The integer representing the index position for screen_name in the csv file #op = mode for writing the output file, 'w' by default. If none found, a new one is created #Return: None, only writes information on file """ pu.set_unicode() #accounts = file_loader.loadCSV(csv_path, keyIndex) header = "group;subgroup;name;screen_name;lang;twitter_id;" header += "followers_count;listed_count;statuses_count;friends_count;favourites_count;" header += "url;created_at;time_zone\n" #Creating files which will contain information about all accounts and their file path which will contain the followers f_index = file_loader.loadPath(_output_path+'index.txt') f_csv = file_loader.loadPath(_output_path+'all-users.csv') f_csv.write(header) total = len(accounts) i = 0 #Iterating over the dictionary with all accounts for screen_name in accounts.keys(): p = Person() #If the user is labeled in more than one group/subgroup, it will be considered the last one user = accounts[screen_name].pop() p_path = _output_path+user['group']+'/'+user['subgroup']+'/'+screen_name+".txt" info = get_user_info(screen_name) #if twitter account exists, PS: user data is at info[0]: if info is not False: f_index.write(p_path+'\n') save_user_info(p, info[0], user['group'], user['subgroup'], header, p_path) f_csv.write(str(p)) i = i + 1 #Printing the progress on the screen print 'Status: ', str(round(100*float(i)/total,2))+'% completed' f_index.close() f_csv.close()
def save_user_info(p, info, group, subgroup, header, p_path):
    """
    #@Arguments:
    #p = person object
    #info = a list with data (the user's JSON entry)
    #group = string with user group
    #subgroup = string with user subgroup
    #header = string containing the columns for the csv file
    #p_path = path to the file to write this user info
    #Returns nothing, just write info on file
    """
    #The original docstring's closing quotes were misplaced, swallowing this call
    #so it never executed; the signature also lacked group/subgroup even though
    #get_all_accounts passes both.
    p.loadFromJSON(info, group, subgroup)
    #Creates a file for each account and this file will contain all of this user followers
    f_p = file_loader.loadPath(p_path)
    f_p.write(header)
    f_p.write(str(p))
    #p_path embeds the screen name, identifying the account that was retrieved
    log.note("Retrieved: "+p_path)
    f_p.close()
def __init__(self):
    #Set up the file loader used by this object's methods.
    #NOTE(review): this def appears outside any visible class and duplicates
    #CSVUtil.__init__ -- confirm which class it belongs to (may be an orphaned copy).
    self._fLoader = PyFileLoader()