Example #1
def generate():
    _db = Database("app.db")
    data_access = DataAccess(db=_db)
    data_access.generate_dataset("all_friends.csv")
    friends_list = pd.read_csv(get_output_file_path("all_friends.csv"))
    users = pd.read_csv(get_input_file_path("all_infected.csv"))
    user_ids = set(users['id'])
    friends_list['friends'] = friends_list.apply(lambda x: list(set(json.loads(x['friends'])) & user_ids), axis=1)
    friends_list.to_csv(get_output_file_path("filtered_friends.csv"), index=False, header=False, sep="\t")
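All of the examples on this page call get_input_file_path / get_output_file_path (and Example #9 also calls ensure_output_path), helpers that are not shown here. A minimal sketch of what they might look like, assuming a flat input/ and output/ directory layout (the directory names are placeholders); the snippets otherwise assume the usual imports (pandas as pd, numpy as np, matplotlib.pyplot as plt, json, csv, pickle):

import os

# Hypothetical layout; the real helpers are not shown in these examples.
INPUT_DIR = "input"
OUTPUT_DIR = "output"

def get_input_file_path(filename):
    # Resolve a data file against the input directory.
    return os.path.join(INPUT_DIR, filename)

def get_output_file_path(filename):
    # Resolve a result file against the output directory.
    return os.path.join(OUTPUT_DIR, filename)

def ensure_output_path():
    # Create the output directory if it does not already exist.
    os.makedirs(OUTPUT_DIR, exist_ok=True)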
Example #2
def merge(ext_followers, source_candidates, out_file):
    print(f"Loading dataframe {ext_followers}")
    ext_followers_df = pd.read_csv(get_output_file_path(ext_followers),
                                   sep='\t')
    print(f"Loading dataframe {source_candidates}")
    source_candidates_df = pd.read_csv(get_output_file_path(source_candidates),
                                       sep='\t',
                                       names=["id", "source_candidates"])

    df = ext_followers_df.join(source_candidates_df.set_index('id'), on='id')
    df["followers_list"] = None
    print(f"saving dataframe as {out_file}")
    df.to_csv(get_output_file_path(out_file), index=False)
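Note the join pattern used above: left.join(right.set_index('id'), on='id') is a left join, so ids missing from the right frame come through with NaN values. A toy illustration with invented data:

import pandas as pd

left = pd.DataFrame({"id": [1, 2, 3], "followers": [10, 20, 30]})
right = pd.DataFrame({"id": [2, 3], "source_candidates": ["[5]", "[7]"]})

# Left join: id 1 has no match in `right`, so its source_candidates is NaN.
merged = left.join(right.set_index("id"), on="id")
print(merged)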
Example #3
    def _merge(user_file, avg_file, out_file):
        user_df = pd.read_csv(get_input_file_path(user_file))
        nd_df = pd.read_csv(get_output_file_path(avg_file),
                            sep='\t',
                            names=["user_id", "avg_neighbour_degree"])

        df = user_df.join(nd_df.set_index('user_id'), on='user_id')
        print(f"saving dataframe as {out_file}")
        df.to_csv(get_output_file_path(out_file), index=False)
        df_filter = df.drop_duplicates(subset='avg_neighbour_degree',
                                       keep="last")
        df_filter = df_filter.sort_values(by=['avg_neighbour_degree'],
                                          ascending=False)
        df_filter.to_csv(get_output_file_path(f"filtered_{out_file}"),
                         index=False)
    def generate(self, in_file, out_file, start_time):
        sim_data = pd.read_csv(get_input_file_path(in_file))
        sim_data['infection_source'] = sim_data['source_candidates'].map(
            lambda x: self.__find(sim_data, x))
        out_columns = ['id', 'time_lapsed', 'infection_source']
        temp_df = sim_data[out_columns]
        sorted_df = temp_df.sort_values(by=['time_lapsed'])

        initial_nodes = set()
        initial_links = list()

        dynamic_nodes = set()
        dynamic_links = list()
        for _, row in sorted_df.iterrows():
            if row['time_lapsed'] <= start_time:
                initial_nodes.add(row['id'])
                if not np.isnan(row['infection_source']):
                    initial_nodes.add(row['infection_source'])
                    if not np.isnan(row['time_lapsed']):
                        initial_links.append({
                            "source": row['infection_source'],
                            "target": row['id']
                        })

            else:
                dynamic_nodes.add(row['id'])
                if not np.isnan(row['infection_source']):
                    dynamic_nodes.add(row['infection_source'])
                    if not np.isnan(row['time_lapsed']):
                        dynamic_links.append({
                            "source": row['infection_source'],
                            "target": row['id'],
                            "timeLapsed": row['time_lapsed']
                        })

        data = {
            "initialData": {
                "nodes": [{"id": x, "group": 1} for x in initial_nodes],
                "links": initial_links
            },
            "dynamicData": {
                "nodes": [{"id": x} for x in dynamic_nodes],
                "links": dynamic_links
            }
        }

        with open(get_output_file_path(out_file), 'w') as fp:
            json.dump(data, fp)


#generator = SimGraphDataGenerator()
#generator.generate("givenchy_simulation_result_6hrs_6_hrs_model_retrained.csv", "6hrs_sim_graph_retrained.json", 360.0)
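For reference, the JSON written by generate has the shape below; the node ids and time are illustrative, and group is hard-coded to 1 for initial nodes:

{
    "initialData": {
        "nodes": [{"id": 101.0, "group": 1}, {"id": 102.0, "group": 1}],
        "links": [{"source": 101.0, "target": 102.0}]
    },
    "dynamicData": {
        "nodes": [{"id": 103.0}],
        "links": [{"source": 102.0, "target": 103.0, "timeLapsed": 412.5}]
    }
}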
Example #5
def plot(fname, size, save=False):
    plt.title(fname)
    plt.tight_layout()
    fig = plt.gcf()
    fig.set_size_inches(size[0], size[1], forward=True)
    plt.rcParams['figure.figsize'] = size
    if save:
        plt.savefig("{}.pdf".format(get_output_file_path(fname)), dpi=600)
    plt.show()
    plt.close()
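A hypothetical call, assuming matplotlib.pyplot is imported as plt and something has already been drawn on the current figure:

import matplotlib.pyplot as plt

plt.plot([0, 1, 2], [3, 1, 2])  # draw something first
plot("degree_distribution", (8, 6), save=True)  # titles, resizes, saves a PDF, shows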
def create_filter_set(filter_file):

    filter_df = pd.read_csv(get_output_file_path(filter_file),
                            sep='\t',
                            names=["id", "followers"])

    id_set = set()
    for _, row in filter_df.iterrows():
        followers = set(map(int, json.loads(row["followers"])))
        id_set.update(followers)
    return id_set
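create_filter_set expects a headerless TSV whose second column is a JSON-encoded list of follower ids. The core transformation, on made-up rows:

import json

rows = [("42", "[1, 2, 3]"), ("43", "[3, 4]")]  # (id, followers) pairs
id_set = set()
for _, followers in rows:
    id_set.update(map(int, json.loads(followers)))
print(id_set)  # {1, 2, 3, 4}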
def json_to_csv(in_file, filter_file, out_file, event_day):

    filter_ids = create_filter_set(filter_file)

    with open(in_file, "r") as json_in:
        with open(get_output_file_path(out_file), "w") as csv_out:
            header = "\t".join(columns)
            csv_out.write(f"{header}\n")

            for line in tqdm(json_in, total=4417245):
                row = {}
                user = json.loads(line)
                if int(user["id"]) in filter_ids:

                    created_at = datetime.strptime(user["created_at"],
                                                   "%a %b %d %H:%M:%S %z %Y")
                    ucd = event_day - created_at
                    user_created_days = ucd.days if ucd.days > 0 else 1
                    row["id"] = user["id"]
                    row["created_at"] = created_at.strftime(
                        "%Y-%m-%d %H:%M:%S")
                    row["favourites_count"] = user["favourites_count"]
                    row["followers_count"] = user["followers_count"]
                    row["friends_count"] = user["friends_count"]
                    row["listed_count"] = user["listed_count"]
                    row["statuses_count"] = user["statuses_count"]
                    row["user_created_days"] = user_created_days
                    days = row['user_created_days']
                    row['normalized_statuses_count'] = row['statuses_count'] / days
                    row['normalized_followers_count'] = row['followers_count'] / days
                    row['normalized_favourites_count'] = row['favourites_count'] / days
                    row['normalized_listed_count'] = row['listed_count'] / days
                    row['normalized_friends_count'] = row['friends_count'] / days

                    row_line = "\t".join(str(row[col]) for col in columns)
                    csv_out.write(f"{row_line}\n")
Example #8
    def _compute_adv_degree(self):
        print("starting computation for avg neighbours degree")
        avg_file = f"avg_neighbour_degree_{self.adj_list_file}"
        with open(get_output_file_path(avg_file), 'w') as out:
            with open(get_input_file_path(self.adj_list_file), 'r') as tsvfile:
                reader = csv.reader(tsvfile, delimiter='\t')
                for row in reader:
                    source = row[0]
                    targets = json.loads(row[1])
                    neighbour_degree = 0
                    neighbour_count = 0
                    for t in targets:
                        if t in self.adj_list:
                            neighbour_degree += len(self.adj_list[t])
                            neighbour_count += 1

                    #neighbour_degree = neighbour_degree / len(targets)
                    if neighbour_count == 0:
                        neighbour_count = 1

                    avg_neighbour_degree = neighbour_degree / neighbour_count
                    out.write(f"{source}\t{avg_neighbour_degree}\n")

        return avg_file
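A worked example of the computation, assuming a toy self.adj_list (used above as a dict mapping node id to its neighbour list):

adj_list = {"a": ["b", "c"], "b": ["a"], "c": ["a", "b"]}
# For source "a" with targets ["b", "c"]:
#   degree(b) = 1, degree(c) = 2
#   avg_neighbour_degree = (1 + 2) / 2 = 1.5
# The line written to the output file would be: a<TAB>1.5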
Example #9
    def generate_dataset(self, out_file):
        df = self.db.select_to_dataframe(self.SQL_ALL_USER)
        ensure_output_path()
        df.to_csv(get_output_file_path(out_file), index=False)


def load_pickle_file(path):
    infile = open(path, 'rb')
    unpickled_file = pickle.load(infile)
    print(f'Loaded {len(unpickled_file)} entries')
    infile.close()
    return unpickled_file


def save_pickle_file(path, data):
    print('Dumping data to path {}'.format(path))
    with open(path, 'wb') as file:
        pickle.dump(data, file)
    print('Finished dumping data to path {}'.format(path))


current_time = 420

users = load_pickle_file(get_output_file_path("nyc_users_6_9_infected.dat"))
ext_followers = pd.read_csv(get_output_file_path("nyc_6_9_ext_followers.csv"))


def id_to_index_list(idx_lookup, src_candidate_ids):
    # Map ids to dataframe indices, skipping ids absent from the lookup.
    return [idx_lookup[i] for i in src_candidate_ids if i in idx_lookup]
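id_to_index_list translates user ids into dataframe row indices via a lookup dict, silently dropping ids missing from the lookup. For example:

idx_lookup = {101: 0, 202: 1, 303: 2}
print(id_to_index_list(idx_lookup, [202, 999, 101]))  # [1, 0]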


network_simulation = pd.DataFrame(columns=['id', 'time_lapsed', 'favourites_count', 'followers_count', 'friends_count',
                                           'listed_count', 'statuses_count', 'source_candidates', 'source_index',
                                           'seed_index', 'generation', 'time_since_seed', 'user_created_days',
                                           'normalized_statuses_count', 'normalized_followers_count',
                                           'normalized_favourites_count', 'normalized_listed_count',
                                           'normalized_friends_count'])

# Series.append was removed in pandas 2.0; pd.concat is the equivalent way
# to stack the two id columns.
network_simulation['id'] = pd.concat([users['id'], ext_followers['id']],
                                     ignore_index=True)