Esempio n. 1
0
def get_all_lists(query, keyIndex = 3):
    """
    #@Arguments
    #query = string to decide which kind of list must be retrived, i.e.: followers, friends, and etc..
    #keyIndex = An integefer for screen_name, which we assume by default that is at position 3
    #Return: None, it gets all values of given query and writes it on file    
    #It is expected to have an index file with all users file_path
    """
    paths = file_loader.loadList(_output_path+'index.txt')
    if paths == False:
        log.warning("No paths file to load")
        print "No paths file to load"
        sys.exit()
    current_user_index = pu.get_last_status(paths, log_path+"status.txt")
    print 'Starting at: ', paths[current_user_index].rstrip("\n")
    percentage = get_percentage(_output_path+'all-users.csv', query+"_count", current_user_index)
    last_user_index = len(paths)
    while current_user_index < last_user_index:
        #Loading from the account txt file which will create a dictionary at this point. 
        user = pu.sort_dict(file_loader.loadCSV(paths[current_user_index], keyIndex))
        screen_name = user.keys()[0]
        query_list = get_list(screen_name, query)
        percentage["collected_"+query+"_count"] += len(query_list)
        store_listed_info(query, query_list, screen_name, paths[current_user_index])
        print "Status:", round(100*float(percentage["collected_"+query+"_count"])/percentage["total_"+query+"_count"], 2), "completed"
        current_user_index += 1
Esempio n. 2
0
def get_non_retrieved(info_label):
    """
    #@Argument:
    """
    paths = file_loader.loadList(_output_path+"index.txt")
    current_user_index = pu.get_last_status(paths, log_path+"status.txt")
    i = 0
    non_retrieved_list = []
    for path in paths:
        label_found = False
        aux = 0
        for line in open(path.lstrip("/").rstrip("\n")):
        #First line right after the label, therefore there is data (at least one!)
            if label_found == True:
                aux += 1
                break
            if info_label in line:
                label_found = True
        if aux == 0:
            non_retrieved_list.append(path.rstrip("\n"))
        if i == current_user_index:
            break
        i = i + 1
    print i, len(non_retrieved_list)
    file_loader.listToTXT(non_retrieved_list, _output_path+"non_retrieved.txt")
Esempio n. 3
0
def get_all_accounts(csv_path, keyIndex):
    """
    #@Arguments: 
    #csv_path = Path string for the .csv file which contains all accounts information. 
    #keyIndex = The integer representing the index position for screen_name in the csv file
    #op = mode for writing the output file, 'w' by default. If none found, a new one is created
    #Return: None, only writes information on file
    """
    pu.set_unicode()
    #accounts = file_loader.loadCSV(csv_path, keyIndex)
    header = "group;subgroup;name;screen_name;lang;twitter_id;"
    header += "followers_count;listed_count;statuses_count;friends_count;favourites_count;"
    header += "url;created_at;time_zone\n"     
    #Creating files which will contain information about all accounts and their file path which will contain the followers
    f_index = file_loader.loadPath(_output_path+'index.txt')
    f_csv = file_loader.loadPath(_output_path+'all-users.csv')
    f_csv.write(header)
    total = len(accounts)
    i = 0
    #Iterating over the dictionary with all accounts
    for screen_name in accounts.keys():
        p = Person()
        #If the user is labeled in more than one group/subgroup, it will be considered the last one
        user = accounts[screen_name].pop()
        p_path = _output_path+user['group']+'/'+user['subgroup']+'/'+screen_name+".txt"
        info = get_user_info(screen_name)
        #if twitter account exists, PS: user data is at info[0]:
        if info is not False:
            f_index.write(p_path+'\n')
            save_user_info(p, info[0], user['group'], user['subgroup'], header, p_path)
            f_csv.write(str(p))
        i = i + 1
        #Printing the progress on the screen
        print 'Status: ', str(round(100*float(i)/total,2))+'% completed'
    f_index.close()
    f_csv.close()