Esempio n. 1
0
def get_non_retrieved(info_label):
    """
    #@Argument:
    """
    paths = file_loader.loadList(_output_path+"index.txt")
    current_user_index = pu.get_last_status(paths, log_path+"status.txt")
    i = 0
    non_retrieved_list = []
    for path in paths:
        label_found = False
        aux = 0
        for line in open(path.lstrip("/").rstrip("\n")):
        #First line right after the label, therefore there is data (at least one!)
            if label_found == True:
                aux += 1
                break
            if info_label in line:
                label_found = True
        if aux == 0:
            non_retrieved_list.append(path.rstrip("\n"))
        if i == current_user_index:
            break
        i = i + 1
    print i, len(non_retrieved_list)
    file_loader.listToTXT(non_retrieved_list, _output_path+"non_retrieved.txt")
Esempio n. 2
0
class CSVUtil(object):
  
  def __init__(self):
    self._fLoader = PyFileLoader()
  #@Arguments:
    #path = path string for a csv file 
    #keyIndex = an interger with the column index to become the dictionary key, set as 0 by default, i.e.: first column of csv file
  #Return: A dictionary whose keys were set by keyIndex and the value is a list of dictionaries for the specified key
  def loadCSV(self, path, keyIndex=0):
    path = path.lstrip("/").rstrip("\n")
    collection = {}
    try:
      data = self._fLoader.loadPath(path)
      n = 0
      header = ""
      for row in data:
        info = row.rstrip("\n").split(";")
        #Getting value as the key for the dictionary
        key = info.pop(keyIndex)     
        #Reading the header
        if n == 0:
          header = info
          n = n + 1
        #For the rest of the file
        else:
          single = {}
          i = 0
          for col in header:
            single[col] = info[i]
            i = i+1
          self._fLoader.pushDict(collection, key, single)
    except Exception, e:
      print e
    return collection
Esempio n. 3
0
def get_all_lists(query, keyIndex = 3):
    """
    #@Arguments
    #query = string to decide which kind of list must be retrived, i.e.: followers, friends, and etc..
    #keyIndex = An integefer for screen_name, which we assume by default that is at position 3
    #Return: None, it gets all values of given query and writes it on file    
    #It is expected to have an index file with all users file_path
    """
    paths = file_loader.loadList(_output_path+'index.txt')
    if paths == False:
        log.warning("No paths file to load")
        print "No paths file to load"
        sys.exit()
    current_user_index = pu.get_last_status(paths, log_path+"status.txt")
    print 'Starting at: ', paths[current_user_index].rstrip("\n")
    percentage = get_percentage(_output_path+'all-users.csv', query+"_count", current_user_index)
    last_user_index = len(paths)
    while current_user_index < last_user_index:
        #Loading from the account txt file which will create a dictionary at this point. 
        user = pu.sort_dict(file_loader.loadCSV(paths[current_user_index], keyIndex))
        screen_name = user.keys()[0]
        query_list = get_list(screen_name, query)
        percentage["collected_"+query+"_count"] += len(query_list)
        store_listed_info(query, query_list, screen_name, paths[current_user_index])
        print "Status:", round(100*float(percentage["collected_"+query+"_count"])/percentage["total_"+query+"_count"], 2), "completed"
        current_user_index += 1
Esempio n. 4
0
def get_list(screen_name, query):
    """
    #@Arguments:
    #screen_name = a string with the user screen name 
    #query = string to decide which kind of list must be retrived, i.e.: followers, friends, and etc..
    #Return: list of ids for the given query or an empty list in case it is not possible, i.e. a protected account
    """
    print 'Collect list of:'+screen_name+"'s "+query
    #Marking the current user
    current_user = file_loader.loadPath(log_path+"status.txt", 'w')
    current_user.write(screen_name)
    current_user.close()
    _list = []
    while CURSOR != "0":
        url = "https://api.twitter.com/1.1/"+query+"/ids.json?cursor="
        url += CURSOR+"&screen_name="+screen_name+"&count=5000"
        info = CREDENTIALS[CREDENTIAL_INDEX].request(url)
        data = check_limit(info, screen_name, 'list')
        if data is not False:
            if "ids" not in data:
                print "Something went wrong, data = ", data
            _list = _list + data["ids"]
        else:
            print 'Some problem with data, URL = ', url
    CURSOR = "-1"
    return _list
Esempio n. 5
0
def store_listed_info(query, ids_list, screen_name, path):
    """
    #@Arguments:
    #query = string naming the kind of list, i.e.: followers, friends, and etc..
    #ids_list = a list of ids or an empty list
    #screen_name = a string with the user screen name
    #path = path for the file which contains user information and will hold all the followers
    #Return: None. Method saves information or stops program execution
    """
    # The original test "ids_list is not []" is always True (a fresh [] is a
    # distinct object), making the error branch dead code. An empty list is a
    # legitimate result (e.g. protected account, see get_list), so only a
    # missing list is treated as fatal.
    if ids_list is not None:
        file_loader.listToTXT(ids_list,path,'a+', '', '#'+query.capitalize()+"\n")
    else:
        print 'Unexpected error'
        log.warning("Unexpected error! Couldn't retrieve followers of "+screen_name)
        sys.exit()
    print len(ids_list), query.capitalize()+' for ', screen_name
    log.note("Information successfully retrieved: "+screen_name)
Esempio n. 6
0
def get_all_accounts(csv_path, keyIndex):
    """
    #@Arguments: 
    #csv_path = Path string for the .csv file which contains all accounts information. 
    #keyIndex = The integer representing the index position for screen_name in the csv file
    #op = mode for writing the output file, 'w' by default. If none found, a new one is created
    #Return: None, only writes information on file
    """
    pu.set_unicode()
    #accounts = file_loader.loadCSV(csv_path, keyIndex)
    header = "group;subgroup;name;screen_name;lang;twitter_id;"
    header += "followers_count;listed_count;statuses_count;friends_count;favourites_count;"
    header += "url;created_at;time_zone\n"     
    #Creating files which will contain information about all accounts and their file path which will contain the followers
    f_index = file_loader.loadPath(_output_path+'index.txt')
    f_csv = file_loader.loadPath(_output_path+'all-users.csv')
    f_csv.write(header)
    total = len(accounts)
    i = 0
    #Iterating over the dictionary with all accounts
    for screen_name in accounts.keys():
        p = Person()
        #If the user is labeled in more than one group/subgroup, it will be considered the last one
        user = accounts[screen_name].pop()
        p_path = _output_path+user['group']+'/'+user['subgroup']+'/'+screen_name+".txt"
        info = get_user_info(screen_name)
        #if twitter account exists, PS: user data is at info[0]:
        if info is not False:
            f_index.write(p_path+'\n')
            save_user_info(p, info[0], user['group'], user['subgroup'], header, p_path)
            f_csv.write(str(p))
        i = i + 1
        #Printing the progress on the screen
        print 'Status: ', str(round(100*float(i)/total,2))+'% completed'
    f_index.close()
    f_csv.close()
Esempio n. 7
0
def save_user_info(p, info, group, subgroup, header, p_path):
    """
    #@Arguments:
    #p = person object
    #info = a list with data
    #group = string with user group
    #subgroup = string with user subgroup
    #header = string containing the columns for the csv file
    #p_path = path to the file to write this user info
    #Returns nothing, just write info on file
    """
    # NOTE(review): the original signature took 5 parameters while the caller
    # in get_all_accounts passes 6 (p, info, group, subgroup, header, p_path);
    # the signature is aligned with that caller here.
    # This call was accidentally left inside the docstring in the original,
    # so the Person object was never populated before being written out.
    p.loadFromJSON(info, group, subgroup)
    #Creates a file for each account and this file will contain all of this user followers
    f_p = file_loader.loadPath(p_path)
    f_p.write(header)
    f_p.write(str(p))
    # The original logged the now-removed "user" parameter; the file path
    # (which embeds the screen_name) identifies the account instead.
    log.note("Retrieved: "+p_path)
    f_p.close()
Esempio n. 8
0
 def __init__(self):
   # File loader this helper delegates all of its file I/O to.
   self._fLoader = PyFileLoader()