def create_graph(self, node_edge_pairs=None, filepath=None, graph_type='digraph'):
    # Avoid a mutable default argument; load from file when no pairs are passed in.
    if not node_edge_pairs:
        start_time = time.time()
        data_loader = DataLoader(
            filepath=filepath,
            full_file=True,
            cols_to_load=['SOURCE_SUBREDDIT', 'TARGET_SUBREDDIT'])
        node_edge_pairs = data_loader.load()
        print("Data load from file took {} seconds".format(time.time() - start_time))

    start_time = time.time()
    if graph_type == 'digraph':
        G = nx.DiGraph()
    elif graph_type == 'multidigraph':
        G = nx.MultiDiGraph()
    else:
        raise ValueError("Unknown graph_type: {}".format(graph_type))

    # Initialize loading counter.
    i = 1
    # 'G.add_edges_from(node_edge_pairs[1:])' would also work, but the explicit loop
    # lets us report progress. (Calling both would insert every edge twice in a MultiDiGraph.)
    for node_from, node_to in node_edge_pairs[1:]:  # skip the header row
        # Counter to track progress loading
        if i % 10000 == 0:
            print(i)
        i += 1
        # Add edge; endpoint nodes are created implicitly.
        G.add_edge(node_from, node_to)
    end_time = time.time()
    print("Data load into graph took {} seconds.".format(end_time - start_time))
    return G
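# Quick illustration (not part of the original module) of why graph_type matters here:
# a DiGraph collapses repeated (source, target) pairs into a single edge, while a
# MultiDiGraph keeps every hyperlink as a parallel edge.
import networkx as nx

pairs = [('askreddit', 'pics'), ('askreddit', 'pics')]
print(nx.DiGraph(pairs).number_of_edges())       # 1 -- duplicate edge collapsed
print(nx.MultiDiGraph(pairs).number_of_edges())  # 2 -- parallel edges kept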
def Main():
    # load the data
    data = DataLoader()
    lambdas = [(i + 1) * 2.0 for i in range(32)]
    batch_size = 500
    num_batch = 2
    sigma = 1.0
    noise = 1e-3
    models = []
    keys = data.keys()

    for i in range(len(lambdas)):
        model = GraphLasso(lambdas[i], batch_size=batch_size, max_iter_outer=20,
                           max_iter_inner=20, eps=1e-4)
        models.append(model)

    for i in range(len(models)):
        theta_sum = np.zeros(shape=(len(keys) * len(keys), ))  # avoid shadowing built-in 'sum'
        list_theta = []
        for each_batch in range(num_batch):
            X_batch, y_batch = data.sample_batch(batch_size)
            # estimate the precision matrix
            theta = models[i].estimate(X_batch)
            theta_sum = theta_sum + theta.flatten()
            list_theta.append(theta.flatten())
        list_theta = np.stack(list_theta, axis=0)
        mean = theta_sum / float(num_batch)

        # compute the training error / AIC (currently disabled)
        """
        cov = np.cov(np.transpose(list_theta))
        cov = cov + noise * np.identity(cov.shape[0])
        dist = multivariate_normal(mean=mean, cov=cov)
        surrogate = 0.0
        for i in range(num_batch):
            log_prob = dist.logpdf(list_theta[i])
            surrogate = surrogate + log_prob
        train_error = -2 * surrogate / float(batch_size)
        df = float(len(keys) * (len(keys) - 1)) / 2.0
        AIC = train_error + 2.0 * df * sigma * sigma / float(num_batch)
        print("AIC for lambda {}: {}".format(lambdas[i], AIC))
        """

        # compute the adjacency matrix from the averaged precision matrix
        adj_matrix = np.reshape(mean, newshape=(len(keys), len(keys)))
        adj_matrix[np.abs(adj_matrix) < 1e-9] = 0
        adj_matrix = np.abs(np.sign(adj_matrix))
        for diag in range(adj_matrix.shape[0]):
            adj_matrix[diag, diag] = 0

        graph = adj_matrix_to_graph(adj_matrix, keys, lambdas[i])
        graph.layout(prog='circo')
        graph.draw("./graphWithLambdaValue{}.png".format(lambdas[i]))
def multigraph_node_only_title_load_and_pickle():
    title_data_loader = DataLoader(
        'data/soc-redditHyperlinks-title.tsv',
        full_file=True,
        cols_to_load=['SOURCE_SUBREDDIT', 'TARGET_SUBREDDIT'])
    title_data = title_data_loader.load()

    nx_graph_creator = NetworkXGraphCreator()
    G = nx_graph_creator.create_graph(title_data, graph_type='multidigraph')
    pickle_obj(G, "data_pickle/networkx_multigraph_title.pickle")
def full_title_load_and_pickle():
    title_data_loader = DataLoader('data/soc-redditHyperlinks-title.tsv', full_file=True)
    title_data = title_data_loader.load()

    nx_attr_graph_creator = NetworkXAttributeGraphCreator()
    G_attr = nx_attr_graph_creator.create_graph(
        title_data,
        attr_names=['post_id', 'timestamp', 'post_label', 'post_properties'],
        graph_type='multidigraph')
    pickle_obj(G_attr, "data_pickle/networkx_attr_full_title.pickle")
def main():
    "main function"
    # optional command line args
    parser = argparse.ArgumentParser()
    parser.add_argument('--train', help='train the NN', action='store_true')
    parser.add_argument('--validate', help='validate the NN', action='store_true')
    parser.add_argument('--beamsearch',
                        help='use beam search instead of best path decoding',
                        action='store_true')
    parser.add_argument('--wordbeamsearch',
                        help='use word beam search instead of best path decoding',
                        action='store_true')
    parser.add_argument('--dump', help='dump output of NN to CSV file(s)', action='store_true')
    args = parser.parse_args()

    decoderType = DecoderType.BestPath
    if args.beamsearch:
        decoderType = DecoderType.BeamSearch
    elif args.wordbeamsearch:
        decoderType = DecoderType.WordBeamSearch

    # train or validate on IAM dataset
    if args.train or args.validate:
        # load training data, create TF model
        loader = DataLoader(FilePaths.fnTrain, Model.batchSize, Model.imgSize, Model.maxTextLen)

        # save characters of model for inference mode
        open(FilePaths.fnCharList, 'w').write(str().join(loader.charList))

        # save words contained in dataset into file
        open(FilePaths.fnCorpus, 'w').write(str(' ').join(loader.trainWords + loader.validationWords))

        # execute training or validation
        if args.train:
            model = Model(loader.charList, decoderType)
            train(model, loader)
        elif args.validate:
            model = Model(loader.charList, decoderType, mustRestore=True)
            validate(model, loader)

    # infer text on test image
    else:
        print(open(FilePaths.fnAccuracy).read())
        model = Model(open(FilePaths.fnCharList).read(), decoderType, mustRestore=True, dump=args.dump)
        infer(model, FilePaths.fnInfer)
def fit(self, saveModel=True, saveuser_list=True, saveISBN_list=True, rating_num=10,
        ratingsFile="../data/BX-Book-Ratings.csv"):
    """
    Load the data and train the model.
    :param saveModel: whether to save the model locally
    :param saveuser_list: whether to save the user list locally
    :param saveISBN_list: whether to save the ISBN (book) list locally
    :param rating_num: number of rating records to load
    :param ratingsFile: path to the ratings CSV file
    :return:
    """
    dataLoader = DataLoader()
    # rating_num controls how many records are loaded, which in turn determines the
    # preprocessing time and the rating-prediction time.
    ratings = dataLoader.getDataFrame(ratingsFile, ";", "utf-8", num=rating_num)
    self.R, self.user_list, self.ISBN_list = dataLoader.processDataFrametoArray(ratings)
    if saveModel:
        np.save("../Model/BookRecommendedModel.npy", self.R)
    if saveuser_list:
        with open("../Model/user_list", "w+", encoding="UTF-8") as f:
            f.write(str(self.user_list))
    if saveISBN_list:
        with open("../Model/ISBN_list", "w+", encoding="UTF-8") as f:
            f.write(str(self.ISBN_list))
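# A minimal usage sketch (not from the original project): the class that owns fit()
# is not shown here, so 'BookRecommender' below is a placeholder name for whatever
# recommender class this method belongs to.
recommender = BookRecommender()
recommender.fit(rating_num=1000, ratingsFile="../data/BX-Book-Ratings.csv")
# After fitting, the rating matrix and index lists are available as attributes.
print(recommender.R.shape, len(recommender.user_list), len(recommender.ISBN_list))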
def main():
    config = ConfigReader()
    db = FitbitDatabase(config)
    # db.update_heart_rate()
    # db.update_sleep()

    dl = DataLoader(db)
    # hr_data = dl.get_heart_rate_data(start_date, end_date)
    # sleep_data = dl.get_sleep_data(start_date, end_date)
    #
    # print(hr_data)
    # print(sleep_data)

    da = DataAnalyzer(dl)
    mati_sleep_score = da.get_sleep_scores(
        get_date_from_string(MATI_START_DATE),
        get_date_from_string(MATI_END_DATE))
def main():
    """ Main function """
    # Optional command line args
    parser = argparse.ArgumentParser()
    parser.add_argument("--train", help="train the neural network", action="store_true")
    parser.add_argument("--validate", help="validate the neural network", action="store_true")
    parser.add_argument("--wordbeamsearch",
                        help="use word beam search instead of best path decoding",
                        action="store_true")
    args = parser.parse_args()

    decoderType = DecoderType.BestPath
    if args.wordbeamsearch:
        decoderType = DecoderType.WordBeamSearch

    # Train or validate on Cinnamon dataset
    if args.train or args.validate:
        # Load training data, create TF model
        loader = DataLoader(FilePaths.fnTrain, Model.batchSize, Model.imgSize,
                            Model.maxTextLen, load_aug=True)

        # Execute training or validation
        if args.train:
            model = Model(loader.charList, decoderType)
            train(model, loader)
        elif args.validate:
            model = Model(loader.charList, decoderType, mustRestore=False)
            validate(model, loader)

    # Infer text on test image
    else:
        print(open(FilePaths.fnAccuracy).read())
        model = Model(open(FilePaths.fnCharList).read(), decoderType, mustRestore=False)
        infer(model, FilePaths.fnInfer)
def create_and_pickle_combined_multigraph():
    print("loading body data")
    body_data_loader = DataLoader(
        'data/soc-redditHyperlinks-body.tsv',
        full_file=True,
        cols_to_load=['SOURCE_SUBREDDIT', 'TARGET_SUBREDDIT'])
    body_data = body_data_loader.load()
    print("body data length:", len(body_data))

    print("loading title data")
    title_data_loader = DataLoader(
        'data/soc-redditHyperlinks-title.tsv',
        full_file=True,
        cols_to_load=['SOURCE_SUBREDDIT', 'TARGET_SUBREDDIT'])
    title_data = title_data_loader.load()
    print("title data length:", len(title_data))

    print("creating graphs")
    networkx_loader = NetworkXGraphCreator()
    body_graph = networkx_loader.create_graph(node_edge_pairs=body_data, graph_type='multidigraph')
    print("body graph nodes:", body_graph.number_of_nodes())
    print("body graph edges:", body_graph.number_of_edges())

    title_graph = networkx_loader.create_graph(node_edge_pairs=title_data, graph_type='multidigraph')
    print("title graph nodes:", title_graph.number_of_nodes())
    print("title graph edges:", title_graph.number_of_edges())

    print("combining graphs")
    combined_graph = combine_graphs(body_graph, title_graph)
    print("combined graph nodes:", combined_graph.number_of_nodes())
    print("combined graph edges:", combined_graph.number_of_edges())

    print("pickling combined graph")
    # Pickle the combined graph object itself (not the combine_graphs function).
    pickle_obj(combined_graph, "data_pickle/networkx_multigraph_combined.pickle")
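# combine_graphs() is defined elsewhere in this project; a minimal sketch of what
# such a merge could look like for two MultiDiGraphs (an assumption, not the
# project's actual implementation):
import networkx as nx

def combine_graphs_sketch(g1, g2):
    # Build a fresh MultiDiGraph holding every edge from both inputs, so parallel
    # hyperlinks from the body and title datasets are all preserved.
    combined = nx.MultiDiGraph()
    combined.add_edges_from(g1.edges(data=True))
    combined.add_edges_from(g2.edges(data=True))
    return combined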
def main(path_):
    dl = DataLoader(path_)
    X = dl.clustering()

    clust = Clustering(X)
    clust.process()
            'Recuperados': recovery_total
        }, ignore_index=True)
    return raw_df


def process_town_names_by_region(raw_df: pd.DataFrame) -> dict:
    names = {}
    for region_id in raw_df['id_region'].unique():
        sub_df = raw_df[raw_df['id_region'] == region_id]
        names[region_id] = list(sub_df['nombre_comuna'].unique())
    return names


# getting and processing data
dataLoader = DataLoader()
region_ids = dataLoader.REGION_IDS

# region stats
country_stats = dataLoader.get_country_stats()
last_day, last_df = dataLoader.get_last_day()
reg_latest_accum = process_daily_df(last_df)
reg_stats = dataLoader.get_country_data()
region_opts = [{
    'label': region_ids[region_id],
    'value': region_id
} for region_id in region_ids.keys()]

# town data
town_stats = dataLoader.get_region_data()
town_names = process_town_names_by_region(town_stats[list(
def train_model(model, train_dir):
    loader = DataLoader(Model.batchSize, Model.imgSize, Model.maxTextLen, train_dir)
    logging.info('Model chars: {chars}.'.format(chars=loader.charList))
    train(model, loader)
def run(filename):
    "main function"
    # optional command line args
    parser = argparse.ArgumentParser()
    parser.add_argument('--train', help='train the NN', action='store_true')
    parser.add_argument('--validate', help='validate the NN', action='store_true')
    parser.add_argument('--beamsearch',
                        help='use beam search instead of best path decoding',
                        action='store_true')
    parser.add_argument('--wordbeamsearch',
                        help='use word beam search instead of best path decoding',
                        action='store_true')
    parser.add_argument('--dump', help='dump output of NN to CSV file(s)', action='store_true')
    args = parser.parse_args()

    decoderType = DecoderType.BestPath
    if args.beamsearch:
        decoderType = DecoderType.BeamSearch
    elif args.wordbeamsearch:
        decoderType = DecoderType.WordBeamSearch

    # train or validate on IAM dataset
    if args.train or args.validate:
        # load training data, create TF model
        loader = DataLoader(FilePaths.fnTrain, Model.batchSize, Model.imgSize, Model.maxTextLen)

        # save characters of model for inference mode
        open(FilePaths.fnCharList, 'w').write(str().join(loader.charList))

        # save words contained in dataset into file
        open(FilePaths.fnCorpus, 'w').write(str(' ').join(loader.trainWords + loader.validationWords))

        # execute training or validation
        if args.train:
            model = Model(loader.charList, decoderType)
            train(model, loader)
        elif args.validate:
            model = Model(loader.charList, decoderType, mustRestore=True)
            validate(model, loader)

    # infer text on the word images cropped from the input document
    else:
        index_list = []
        result_list = []
        prob_list = []
        print(open(FilePaths.fnAccuracy).read())
        model = Model(open(FilePaths.fnCharList).read(), decoderType, mustRestore=True, dump=args.dump)
        for dirpath, dirnames, files in os.walk('../output_words/' + filename, topdown=False):
            for sub_file in sorted(files, key=getint):
                img_path = dirpath + '/' + sub_file
                # print('---------------------------------------------------')
                index_number, _ = str(sub_file).split('.')
                # print("File path: " + img_path)
                try:
                    result, prob = infer(model, img_path)
                except ValueError:
                    print("Value error")
                    continue
                # print(index_number, result, prob)
                index_list.append(index_number)
                result_list.append(result)
                prob_list.append(prob)
        return index_list, result_list, prob_list
                    help='Text file for summarization (encoding: "utf-8_sig")')
parser.add_argument("-n", default=3, type=int, help='Number of extracted summary sentences')
parser.add_argument(
    "-lang",
    default='en',
    type=str,
    help='If the language of the article isn\'t English, it will automatically be translated by Google')
parser.add_argument(
    "--super_long",
    action='store_true',
    help='If length of article > 512, this option is needed')
args = parser.parse_args()

# if args.super_long:
#     sys.stdout.write('\n<Warning: Number of extractions might be slightly altered with the --super_long option>\n')

# Language initiator
lf = LangFactory(args.lang)
translator = None if args.lang in lf.support_lang else TranslatorY()

data = DataLoader(args.txt_file, args.super_long, args.lang, translator).data
model = ModelLoader(lf.toolkit.cp, lf.toolkit.opt, args.lang)
summarizer = Summarizer(data, model, args.n, translator)
from src.DataLoader import DataLoader
import dash
import dash_table
import pandas as pd

dataLoader = DataLoader()
stats = dataLoader.get_country_stats()
day, df = dataLoader.get_last_day()
region_ids = dataLoader.REGION_IDS


def process_daily_df(raw_df: pd.DataFrame) -> pd.DataFrame:
    raw_df = raw_df.copy(deep=True)
    raw_df.sort_values(by='id_reg', inplace=True)
    raw_df.rename(columns={
        'nombre_reg': 'Región',
        'casos_totales': 'Casos',
        'fallecidos_totales': 'Fallecidos',
        'recuperados_totales': 'Recuperados'
    }, inplace=True)
    raw_df.drop(columns=[
        'casos_nuevos', 'fallecidos_nuevos', 'recuperados_nuevos', 'id_reg'
    ], inplace=True)
    return raw_df


df = process_daily_df(df)

app = dash.Dash(__name__)
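# A minimal rendering sketch (an assumption, not the project's actual layout):
# the processed daily dataframe could be shown with the imported dash_table.
app.layout = dash_table.DataTable(
    columns=[{'name': col, 'id': col} for col in df.columns],
    data=df.to_dict('records'))

if __name__ == '__main__':
    app.run_server(debug=True)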