def main(args):
    outdir = pathlib.Path(args.o)
    if outdir.exists():
        shutil.rmtree(outdir)
    outdir.mkdir()

    tmp_clusters_path = outdir / 'tmp_clusters.jsonl'
    if tmp_clusters_path.exists():
        tmp_clusters_path.unlink()

    # get article -> cluster mappings
    clusters = list(utils.read_jsonl(args.dataset))
    url_to_cluster_idxs, id_to_cluster_idx = get_article_to_cluster_mappings(
        clusters)

    # add articles from WCEP to clusters, using URLs
    add_wcep_articles_to_clusters(args.wcep_articles, url_to_cluster_idxs,
                                  clusters)

    # add articles from CommonCrawl to clusters, using IDs
    add_cc_articles_to_clusters(clusters, args.cc_articles, id_to_cluster_idx,
                                tmp_clusters_path)

    # split clusters into separate train/val/test files
    split_dataset(outdir, tmp_clusters_path)
    tmp_clusters_path.unlink()

    for fn in ['train.jsonl', 'val.jsonl', 'test.jsonl']:
        cleanup_clusters(outdir / fn, tmp_clusters_path)
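# A minimal sketch of how this entry point might be wired up. Only `-o` is
# implied directly by `args.o`; the remaining flag names are assumptions
# derived from the attributes accessed in main().
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-o', required=True, help='output directory')
    parser.add_argument('--dataset', required=True, help='initial cluster JSONL')
    parser.add_argument('--wcep-articles', dest='wcep_articles', required=True)
    parser.add_argument('--cc-articles', dest='cc_articles', required=True)
    main(parser.parse_args())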
def main():
    twitter = Twitter()
    komoran = Komoran()
    mecab = Mecab()
    argv_dict = {'twitter': twitter, 'komoran': komoran, 'mecab': mecab}
    if len(sys.argv) < 2:
        print('please insert sys_argv \n',
              '1) tokenizer selection: twitter, komoran, mecab \n',
              '2) twitter_tokenizer_option: norm [no argv means True] \n',
              '3) twitter_tokenizer_option: stemming [no argv means True]')
    else:
        output_file_path = input('output_text_file_name: ')
        output_file_path = './data/' + output_file_path
        input_file_path = input('input_text_file_name: ')
        input_file_path = './data/' + input_file_path
        # compare against the literal string: bool() on any non-empty string
        # such as 'False' is always True
        twitter_option = ([sys.argv[2] == 'True', sys.argv[3] == 'True']
                          if len(sys.argv) == 4 else [True, True])
        app_id_list, app_name_list, cate_list, rating_list, review_list = read_jsonl(
            input_file_path, key_ma=False)
        ma_list = []
        for review in tqdm(review_list, desc='tokenizing',
                           total=len(review_list)):
            ma_tokens = get_pos(tokenizer=argv_dict[sys.argv[1]],
                                doc=review,
                                twi_norm=twitter_option[0],
                                twi_stem=twitter_option[1])
            ma_list.append(ma_tokens)
        save_jsonl(output_file_path, app_id_list, app_name_list, cate_list,
                   rating_list, ma_list)
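# Hypothetical invocation (the script name is illustrative):
#   python tokenize_reviews.py twitter True False
# picks the Twitter tokenizer with norm=True and stemming=False, then prompts
# for the input/output file names, which are resolved relative to ./data/.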
def main():
    # load arguments
    args = parse_args()

    # define logging level and format
    level = logging.INFO
    if args.debug:
        level = logging.DEBUG
    logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        level=level)

    # create model depending on model type
    logging.info('Init WordEmbedding ...')
    if "tamperednews" in args.dataset:
        language = "en"
    else:  # news400
        language = "de"
    we = WordEmbedder(fasttext_bin_folder=args.fasttext,
                      language=language,
                      token_types=args.tokens)

    # create output dir
    if not os.path.exists(os.path.dirname(args.output)):
        os.makedirs(os.path.dirname(args.output))

    # read dataset
    dataset = utils.read_jsonl(args.dataset, dict_key='id')

    # check if output file already exists
    logging.info('Generate word embeddings ...')
    if os.path.isfile(args.output):
        mode = 'r+'
    else:
        mode = 'a'

    # save embeddings to h5
    with h5py.File(args.output, mode) as output_file:
        cnt_docs = 0
        for document in dataset.values():
            cnt_docs += 1
            if cnt_docs % 100 == 0:
                logging.info(f'{cnt_docs} documents processed')

            if document['id'] in output_file:
                logging.debug(f"{document['id']} already processed")
                continue

            if "text" not in document.keys():
                output_file[document['id']] = []
                continue

            output_file[document['id']] = we.generate_embeddings(
                document["text"])

    return 0
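# Reading the stored embeddings back is then a plain h5py lookup; the file
# path and document id below are illustrative:
import h5py

with h5py.File('word_embeddings.h5', 'r') as f:
    emb = f['some-document-id'][()]  # array produced by generate_embeddings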
def read_article_ids(path, max_cluster_size):
    id_to_collection = {}
    ids = set()
    for cluster in utils.read_jsonl(path):
        articles = cluster['cc_articles']
        if max_cluster_size != -1:
            articles = articles[:max_cluster_size]
        for a in articles:
            ids.add(a['id'])
            id_to_collection[a['id']] = cluster['collection']
    return ids, id_to_collection
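# utils.read_jsonl itself is not shown in these snippets. A minimal sketch
# consistent with the call sites here (plain iteration, dict_key=...,
# keep_keys=...) could look like the following; the exact semantics of both
# keyword arguments are assumptions:
import json

def read_jsonl(path, dict_key=None, keep_keys=None):
    def gen():
        with open(path, encoding='utf-8') as f:
            for line in f:
                obj = json.loads(line)
                if keep_keys is not None:
                    # assumed: mapping of output key -> source key
                    obj = {new: obj[old] for new, old in keep_keys.items()}
                yield obj
    if dict_key is None:
        return gen()
    # index the objects by the given field, e.g. 'id' or 'wd_id'
    return {obj[dict_key]: obj for obj in gen()}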
def main():
    # load arguments
    args = parse_args()

    # define logging level and format
    level = logging.INFO
    if args.debug:
        level = logging.DEBUG
    logging.basicConfig(format="%(asctime)s %(levelname)s:%(message)s",
                        datefmt="%Y-%m-%d %H:%M:%S",
                        level=level)

    # read urls from input file
    if args.type == "news":
        dataset = utils.read_jsonl(args.input, dict_key="id")
        if not os.path.exists(args.output):
            os.makedirs(args.output)
    else:
        dataset = utils.read_jsonl(args.input, dict_key="wd_id")

    image_urls = []
    for doc in dataset.values():
        if args.type == "news":
            outfile = os.path.join(args.output, doc["id"] + ".jpg")
            image_urls.append([outfile, doc["image_url"], args.maxsize])
        else:
            for image in doc["image_urls"]:
                outfile = os.path.join(args.output, doc["wd_id"],
                                       image["filename"])
                if not os.path.exists(os.path.dirname(outfile)):
                    os.makedirs(os.path.dirname(outfile))
                image_urls.append([outfile, image["url"], args.maxsize])

    # download images
    with multiprocessing.Pool(args.threads) as p:
        p.map(download_news_images, image_urls)

    return 0
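# download_news_images is defined elsewhere. A sketch of a worker compatible
# with the [outfile, url, maxsize] triples built above, using requests and
# Pillow (both assumptions about the real implementation):
import requests
from io import BytesIO
from PIL import Image

def download_news_images(job):
    outfile, url, maxsize = job
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        img = Image.open(BytesIO(response.content)).convert('RGB')
        img.thumbnail((maxsize, maxsize))  # cap the longer side at maxsize
        img.save(outfile)
    except Exception as e:
        print(f'could not download {url}: {e}')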
def Kmeans_clustering(load_file_path, vectorizer, n_cluster=None):
    app_id_list, app_name_list, cate_list, rating_list, ma_list = read_jsonl(
        load_file_path)
    # word_idx = load_json('./data/word_idx_json_by_app_id.txt')
    spaced_ma_list = ma_transform_by_spacing(ma_list)
    coo_matrix, words, _ = get_matrix(vectorizer, spaced_ma_list)
    if not n_cluster:
        n_cluster = len(set(cate_list))
    m = coo_matrix.toarray()
    model = MiniBatchKMeans(n_clusters=n_cluster, verbose=0, n_init=10,
                            batch_size=100)
    model.fit(m)
    return model, words, app_id_list, app_name_list, cate_list, rating_list, ma_list
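# Hypothetical usage, assuming get_matrix accepts a scikit-learn vectorizer
# such as TfidfVectorizer; the file path is illustrative:
from sklearn.feature_extraction.text import TfidfVectorizer

model, words, app_ids, app_names, cates, ratings, mas = Kmeans_clustering(
    './data/tokenized_reviews.jsonl', TfidfVectorizer())
print(model.cluster_centers_.shape)  # (n_cluster, vocabulary size)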
def add_cc_articles_to_clusters(clusters, cc_path, id_to_cluster_idx,
                                tmp_clusters_path):
    print('adding articles from CommonCrawl to clusters')
    n_clusters = len(clusters)
    n_clusters_done = 0
    for i, a in enumerate(utils.read_jsonl(cc_path)):
        if i % 10000 == 0:
            print(f'{i} cc articles done, '
                  f'{n_clusters_done}/{n_clusters} clusters done')
        cluster_idx = id_to_cluster_idx[a['id']]
        c = clusters[cluster_idx]
        c.setdefault('cc_articles_filled', [])
        c['cc_articles_filled'].append(a)
        if len(c['cc_articles']) == len(c['cc_articles_filled']):
            utils.write_jsonl([c], tmp_clusters_path, mode='a')
            clusters[cluster_idx] = None
            n_clusters_done += 1
    print(f'{i} cc articles done, '
          f'{n_clusters_done}/{n_clusters} clusters done')
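# utils.write_jsonl with mode='a' is assumed to append one JSON object per
# line, which is what lets completed clusters be flushed to disk and freed
# from memory above. A minimal sketch under that assumption:
import json

def write_jsonl(items, path, mode='w'):
    with open(path, mode, encoding='utf-8') as f:
        for item in items:
            f.write(json.dumps(item) + '\n')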
def __init__(self, image_dir, testset_path):
    self.image_dir = image_dir

    # Read dataset
    self.dataset = utils.read_jsonl(
        testset_path,
        keep_keys={
            "image_path": "image_path",
            "image_hash": "image_hash",
            "leaf_class_idx": "leaf_class_idx",
            "leaf_wd_id": "leaf_wd_id",
        },
    )

    # Build image transformation
    self.transform = torchvision.transforms.Compose([
        torchvision.transforms.ToPILImage(),
        torchvision.transforms.Resize(size=224),
        torchvision.transforms.CenterCrop(size=224),
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize([0.485, 0.456, 0.406],
                                         [0.229, 0.224, 0.225]),
    ])
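# Hypothetical usage; the class name TestDataset is an assumption, and the
# rest of the Dataset protocol (__len__/__getitem__) is not shown above:
import torch

dataset = TestDataset(image_dir='images/', testset_path='testset.jsonl')
loader = torch.utils.data.DataLoader(dataset, batch_size=32, num_workers=4)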
""" Quick example of how to read outputs and make markdown table """ import pandas as pd import json import numpy as np import flatten_dict from utils import read_jsonl, early_stopping_metrics lines = read_jsonl('outputs/grid_search_results.jsonl') metrics = [ early_stopping_metrics(l['metrics_runs'][0]) for l in lines if 'metrics_runs' in l ] args = [pd.DataFrame([l['args']]) for l in lines if 'metrics_runs' in l] runs = [pd.concat([a.T, m], 0).T for a, m in zip(args, metrics)] df_runs = pd.concat(runs) print('columns', df_runs.columns) # choose only some of the cols metrics = [ 'test_hard_metric.Accuracy', 'test_hard_metric.Exact match', # 'test_hard_metrics.F1-Score', # 'test_hard_metrics.ROC AUC', 'test_metric.Accuracy', 'test_metric.Exact match', # 'test_metrics.F1-Score', # 'test_metrics.ROC AUC'
def eval(data_file, data_type, model_name):
    data = read_jsonl(data_file)
    return topk_eval(model_name, data, data_type, k=1)
    paths['test'] = 'data/test_CNNDM_' + encoder + '.jsonl'
    return paths

path = get_data_path("test", "bert")
print(path)
# for name in path:
#     assert exists(path[name])
#     print(path[name])

datasets = MatchSumPipe(20, "bert").process_from_file(path)
print('Information of dataset is:')
print(datasets)
test_set = datasets.datasets['test']

device = 0
batch_size = 1
for cur_model in models:
    print('Current model is {}'.format(cur_model))

    # load model
    model = torch.load(join(save_path, cur_model))

    # configure testing
    dec_path, ref_path = get_result_path(save_path, cur_model)
    test_metric = MatchRougeMetric(data=read_jsonl(path['test']),
                                   dec_path=dec_path,
                                   ref_path=ref_path,
                                   n_total=len(test_set))
    tester = Tester(data=test_set,
                    model=model,
                    metrics=[test_metric],
                    batch_size=batch_size,
                    device=device,
                    use_tqdm=False)
    tester.test()
def main():
    # load arguments
    args = parse_args()

    # define logging level and format
    level = logging.INFO
    absl.logging.set_stderrthreshold('info')
    absl.logging.set_verbosity('info')
    if args.debug:
        absl.logging.set_stderrthreshold('debug')
        absl.logging.set_verbosity('debug')
        level = logging.DEBUG
    logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S',
                        level=level)

    # check output file and create folder
    if not args.output.endswith('.h5'):
        logging.error('Output file should end with .h5. Exiting ...')
        return 0

    if not os.path.exists(os.path.dirname(args.output)):
        os.makedirs(os.path.dirname(args.output))

    if os.path.isfile(args.output):
        mode = 'r+'
    else:
        mode = 'a'

    # create model depending on model type
    FE = None
    if 'facenet' in args.model:
        logging.info(
            'Building network for embedding based on person verification ...')
        FE = FacialFeatureExtractor(model_path=args.model)
    elif 'location' in args.model:
        logging.info(
            'Building network for embedding based on geolocation estimation ...')
        FE = GeoEstimator(model_path=args.model, use_cpu=args.use_cpu)
    elif 'scene' in args.model:
        logging.info(
            'Building network for embedding based on scene classification ...')
        FE = SceneClassificator(model_path=args.model)

    if not FE:
        logging.error('Unknown model. Exiting ...')
        return 0

    if 'scene' not in args.model and args.logits:
        logging.error(
            'Please specify a scene classification model to create scene logits.')
        return 0

    # read dataset
    if args.type == 'news':
        dataset = utils.read_jsonl(args.input, dict_key='id')
    else:
        dataset = utils.read_jsonl(args.input, dict_key='wd_id')
    logging.info(f'{len(dataset.keys())} dataset entries to process')

    # create embeddings
    with h5py.File(args.output, mode) as output_file:
        for entry in dataset.values():
            images = []
            if args.type == 'news':
                fname = os.path.join(args.directory, entry['id'])
                images.append({'fname': fname, 'search_engine': None})
            else:  # entity
                for image in entry['image_urls']:
                    fname, _ = os.path.splitext(image['filename'])
                    fname = os.path.join(args.directory, entry['wd_id'], fname)
                    images.append({
                        'fname': fname,
                        'search_engine': image['search_engine']
                    })

            for image in images:
                # get image_file with extension
                image_files = glob.glob(image['fname'] + '.*')
                if len(image_files) == 0:
                    logging.info(f"Cannot find image file {image['fname']}.jpg")
                    continue
                image_file = image_files[0]

                if args.type == 'news':
                    h5_key = entry['id']
                else:  # entity
                    h5_key = f"{entry['wd_id']}/{image['search_engine']}/{os.path.basename(image['fname'])}"

                if h5_key in output_file:
                    logging.info(f'Embedding for {h5_key} already computed ...')
                    continue

                logging.info(f'Generate embedding for {h5_key} ...')
                img_embeddings = []
                if args.logits:
                    img_emb = FE.get_logits(image_file)
                else:
                    img_emb = FE.get_img_embedding(image_file)
                for e in img_emb:
                    img_embeddings.append(e)

                if len(img_embeddings) == 0:
                    logging.debug(f'No embedding found for {h5_key} ...')
                    output_file[h5_key] = []
                else:
                    # convert to np array and store to output file
                    id_img_embs = np.asarray(img_embeddings, dtype=np.float32)
                    output_file[h5_key] = id_img_embs

    return 0
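# A sketch of the parse_args this script implies; the flag names are
# assumptions derived from the attributes accessed in main():
import argparse

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--input', required=True, help='dataset JSONL file')
    parser.add_argument('--output', required=True, help='output .h5 file')
    parser.add_argument('--directory', required=True, help='image root folder')
    parser.add_argument('--model', required=True,
                        help='model path containing facenet, location, or scene')
    parser.add_argument('--type', choices=['news', 'entity'], default='news')
    parser.add_argument('--logits', action='store_true')
    parser.add_argument('--use_cpu', action='store_true')
    parser.add_argument('--debug', action='store_true')
    return parser.parse_args()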
def main():
    jobs = list(read_jsonl("./emoji2.jsonl"))
    print(f"{len(jobs)} jobs to run.")
    with Pool() as pool:
        for result in pool.imap_unordered(run_job, jobs, chunksize=10):
            pass