def build_dataset(args):
    for split in ['train', 'val', 'test']:
        print(split)
        t1 = time()
        print('start building dataset')
        if args.worker_num == 1 and cpu_count() > 1:
            print('[INFO] There are %d CPUs in your device, '
                  'please increase -worker_num to speed up' % cpu_count())
            print("       It's an IO-intensive application, "
                  "so 2~10 may be a good choice")
        files = list(iter_files(os.path.join(args.source_dir, split)))
        data_num = len(files)
        group_size = data_num // args.worker_num
        # split the file list into one chunk per worker; the last worker
        # also takes the remainder
        groups = []
        for i in range(args.worker_num):
            if i == args.worker_num - 1:
                groups.append(files[i * group_size:])
            else:
                groups.append(files[i * group_size:(i + 1) * group_size])
        p = Pool(processes=args.worker_num)
        multi_res = [p.apply_async(worker, (fs,)) for fs in groups]
        res = [r.get() for r in multi_res]
        with open(os.path.join(args.target_dir, '%s.json' % split), 'w') as f:
            for row in chain(*res):
                f.write(json.dumps(row, ensure_ascii=False) + '\n')
        t2 = time()
        print('Time Cost : %.1f seconds' % (t2 - t1))
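# A minimal usage sketch (directory names are hypothetical; `worker` and
# `iter_files` are assumed to be defined elsewhere in this module):
#
#     from types import SimpleNamespace
#     build_dataset(SimpleNamespace(worker_num=4,
#                                   source_dir='data/raw',
#                                   target_dir='data/finished'))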
def eval_distance_to_orig():
    with open(sys.argv[1]) as f:
        timeline = timelines.Timeline.from_file(f)
    all_sents = [sent for date in timeline for sent in timeline[date]]

    docs = []
    reader = StanfordXMLReader()
    for dirname in iter_dirs(sys.argv[2]):
        for filename in iter_files(dirname, ".htm.cont.tokenized"):
            try:
                docs.append(reader.run(filename))
            except Exception:
                pass  # skip documents that fail to parse

    tfidf = TfidfVectorizer(stop_words=None)
    tfidf.fit(map(lambda d: d.plaintext, docs))
    sent_vecs = tfidf.transform(
        map(lambda s: " ".join(s.as_token_attr_sequence("form")),
            [sent for doc in docs for sent in doc.sentences]))
    tl_vecs = tfidf.transform(all_sents)

    # for each timeline sentence, similarity to its closest corpus sentence
    sims = cosine_similarity(tl_vecs, sent_vecs)
    max_sims = np.max(sims, axis=1)
    print(max_sims)
    print("Median", np.median(max_sims))
    print("Min", np.min(max_sims))
    print(all_sents[np.argmin(max_sims)])
    print("Max", np.max(max_sims))
def remove_old(path):
    for file in iter_files(path):
        filename = os.path.basename(file)
        print(filename)
        lines = readlines(file)
        i = 0
        cnt = 0
        with open(os.path.join('./final', filename), 'a+') as f:
            for line in lines:
                article = json.loads(line)
                # keep only articles from 2000 onwards
                if 'year' in article and int(article['year']) >= 2000:
                    if 'author' in article:
                        # deduplicate the author list
                        article['author'] = list(set(article['author']))
                    f.write(json.dumps(article) + '\n')
                    f.flush()
                    i += 1
                    # if i % 100000 == 0:
                    #     print(i)
                else:
                    cnt += 1
        print('%s skip:%d, save %d' % (filename, cnt, i))
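# Usage sketch (the function appends into ./final, so that directory must
# exist first; the input path is hypothetical):
#
#     os.makedirs('./final', exist_ok=True)
#     remove_old('./raw_jsonl')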
def extract_json_dir(src, des):
    i = 0
    files = list(iter_files(src))
    length = len(files)
    for idx, file in enumerate(files):
        print("%d/%d" % (idx, length))
        paper = json.load(open(file))
        abs_len = len(paper['abstract'].split())
        art_len = len(paper['article'].split())
        # skip papers whose abstract is longer than the article, or too long overall
        if abs_len > art_len or abs_len > 210:
            continue
        abstract = clean_abs(paper['abstract'])
        if len(abstract) < 2:
            continue
        article = clean_text(paper['article'])
        if len(article) < 2:
            continue
        conclusion = clean_text(paper['conclusion'])
        paper['abstract'] = abstract
        paper['article'] = article
        paper['conclusion'] = conclusion
        json.dump(paper, open(os.path.join(des, '%d.json' % i), 'w'), indent=4)
        i += 1
def read_duc2004_gold_summaries(gold_dir):
    gold_summaries = defaultdict(dict)
    for dirname in iter_dirs(gold_dir):
        for filename in iter_files(dirname, ""):
            if os.path.basename(filename).startswith("APW"):
                content = read_summary_file(filename)
                gold_summaries[os.path.basename(dirname)][
                    os.path.basename(filename)] = content
    return gold_summaries  # the original built this dict but never returned it
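# Usage sketch (assumes the DUC-2004 layout this reader expects: one folder
# per topic, each holding APW* summary files; the path is hypothetical):
#
#     gold = read_duc2004_gold_summaries('duc2004/gold')
#     for topic, per_file in gold.items():
#         print(topic, len(per_file), 'summaries')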
def parse(dir_name='交运物流'):
    for f in iter_files(dir_name):
        try:
            with open(f, 'r') as o:
                doc = json.load(o)  # json has no `read`; `load` is the correct call
            _parseDoc(doc)
            print('succeeded in parsing file {}'.format(f))
        except Exception:
            print('failed to parse file {}'.format(f))
def _extract_paths_labels_groups(
        self) -> Tuple[List[Path], List[str], List[int]]:
    file_paths = []
    labels = []
    groups = []
    paths_labels_groups = None
    metadata = None
    if self._metadata is not None:
        metadata = pd.read_csv(self._metadata)
        if self._path_column in metadata:
            metadata[self._path_column] = \
                metadata[self._path_column].apply(Path)
            if self._label_column in metadata:
                # keep only rows whose files actually exist on disk
                path_existence = metadata[self._path_column].apply(Path.exists)
                metadata = metadata.loc[path_existence]
                metadata[self._label_column] = \
                    metadata[self._label_column].apply(str)
                paths_labels_groups = \
                    metadata[[self._path_column, self._label_column]]
    if paths_labels_groups is None and self._data_root:
        # Fall back to a folder-per-class layout under the data root.
        # DataFrame.append was removed in pandas 2.0; collect rows first.
        rows = []
        for class_folder in utils.iter_folders(self._data_root):
            label = class_folder.name
            for file_path in utils.iter_files(class_folder):
                if self.is_file_integral(file_path):
                    rows.append({self._path_column: file_path,
                                 self._label_column: label})
        paths_labels_groups = pd.DataFrame(rows)
    if paths_labels_groups is not None:
        if self._with_shuffle:
            paths_labels_groups = paths_labels_groups.sample(
                frac=1, replace=False, axis=0, random_state=self._seed)
        file_paths = paths_labels_groups[self._path_column].to_list()
        labels = paths_labels_groups[self._label_column].to_list()
        if self._group and metadata is not None and \
                all(c in metadata.columns
                    for c in [self._path_column, self._group]):
            # to_list() so the result matches the annotated List[int]
            groups = metadata.set_index(self._path_column) \
                .loc[file_paths, self._group].to_list()
    return file_paths, labels, groups
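# The `groups` return value is shaped for grouped cross-validation, so that
# samples sharing a group never straddle a train/validation split. A
# commented-out sketch of how it might be consumed (GroupKFold is real
# sklearn API; the `dataset` instance is hypothetical and this assumes
# grouping was configured):
#
#     from sklearn.model_selection import GroupKFold
#
#     file_paths, labels, groups = dataset._extract_paths_labels_groups()
#     for train_idx, val_idx in GroupKFold(n_splits=5).split(
#             file_paths, labels, groups=groups):
#         ...  # fit on train_idx, evaluate on val_idx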
def store_contents(data_path, save_path, num_workers=4, num_files=5):
    """Preprocess and store a corpus of documents in sqlite.

    Args:
        data_path: Root path to directory (or directory of directories) of
            files containing json encoded documents (must have `id` and
            `text` fields).
        save_path: Path to output sqlite db.
        num_workers: Number of parallel processes to use when reading docs.
        num_files: Split the db into num_files files.
    """
    logger.info('Reading into database...')
    files = list(utils.iter_files(data_path))
    if num_files == 1:
        filelist = [files]
    else:
        one_length = len(files) // num_files + 1
        # slice instead of indexing so the last chunk cannot run past the end
        filelist = [files[i * one_length:(i + 1) * one_length]
                    for i in range(num_files - 1)]
        filelist.append(files[one_length * (num_files - 1):])
    for i, files in enumerate(filelist):
        logger.info('Building %i-th db...' % i)
        temp_save_path = os.path.join(save_path, 'fever%i.db' % i)
        if os.path.isfile(temp_save_path):
            raise RuntimeError('%s already exists! Not overwriting.' %
                               temp_save_path)
        conn = sqlite3.connect(temp_save_path)
        c = conn.cursor()
        c.execute("CREATE TABLE documents (id PRIMARY KEY, text);")
        workers = ProcessPool(num_workers)
        count = 0
        with tqdm(total=len(files)) as pbar:
            # a single progress bar; the original wrapped imap_unordered in
            # a second, redundant tqdm
            for pairs in workers.imap_unordered(get_contents, files):
                count += len(pairs)
                c.executemany("INSERT INTO documents VALUES (?,?)", pairs)
                pbar.update()
        logger.info('Read %d docs.' % count)
        logger.info('Committing...')
        conn.commit()
        conn.close()
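# Usage sketch (hypothetical paths; with num_files=5 this writes fever0.db
# through fever4.db under db/):
#
#     store_contents('data/wiki-json', 'db/', num_workers=8, num_files=5)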
def extract_json(src, des):
    i = 0
    files = list(iter_files(src))
    for file in tqdm(files):
        tmp = json.load(open(file))['paper']
        paper = dict()
        if "abstract" not in tmp:
            continue
        conclusion = ''
        abs_len = len(' '.join(tmp["abstract"]).split())
        if abs_len > 210:
            continue
        flag = False
        for sec in tmp['sections']:
            if "introduction" in sec.lower():
                int_len = len(' '.join(tmp["sections"][sec]).split())
                if int_len > 1000 or abs_len > int_len:
                    break
                abstract = clean_abs(' '.join(tmp["abstract"]))
                if len(abstract) < 2:
                    break
                introduction = clean_text(' '.join(tmp["sections"][sec]))
                if len(introduction) < 2:
                    break
                flag = True
            if "conclusion" in sec.lower() and flag:
                con_len = len(' '.join(tmp["sections"][sec]).split())
                if con_len > 800:
                    conclusion = ''
                    break
                conclusion = clean_text(' '.join(tmp["sections"][sec]))
                break
        if flag:
            paper["abstract"] = abstract
            paper["article"] = introduction
            paper["conclusion"] = conclusion
            name, _ = os.path.splitext(os.path.basename(file))
            paper["id"] = name
            json.dump(paper, open(os.path.join(des, "%s.json" % name), 'w'),
                      indent=4)
            i += 1
def split(src, ratio=0.94):
    files = list(iter_files(src))
    random.shuffle(files)
    len_train = int(len(files) * ratio)
    len_val = int(len(files) * (1 - ratio) / 2)
    len_test = len(files) - len_train - len_val
    train = files[:len_train]
    val = files[len_train:len_train + len_val]
    test = files[-len_test:]
    for name in ('train', 'val', 'test'):
        if not exists(join(src, name)):
            os.makedirs(join(src, name))
    for each in train:
        shutil.move(each, join(src, 'train'))
    for each in test:
        shutil.move(each, join(src, 'test'))
    for each in val:
        shutil.move(each, join(src, 'val'))
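# Usage sketch: with the default ratio a 10,000-file corpus ends up roughly
# 9,400 / 300 / 300 across train / val / test (the path is hypothetical):
#
#     split('/home/user/dataset/papers', ratio=0.94)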
def dump(split):
    start = time()
    print('start processing {} split...'.format(split))
    data_dir = join(DATA_DIR, split)
    dump_dir = join(DATA_DIR, 'refs', split)
    n_data = count_data(data_dir)
    for i, file in enumerate(iter_files(data_dir)):
        # str.format needs a single %, not the %-formatting escape %%
        print('processing {}/{} ({:.2f}%)\r'.format(i, n_data,
                                                    100 * i / n_data),
              end='')
        name, _ = os.path.splitext(os.path.basename(file))
        with open(join(data_dir, '{}.json'.format(name))) as f:
            data = json.loads(f.read())
        abs_sents = data['abstract']
        with open(join(dump_dir, '{}.ref'.format(name)), 'w') as f:
            f.write(make_html_safe('\n'.join(abs_sents)))
    print('finished in {}'.format(timedelta(seconds=time() - start)))
def cross_eval_main():
    parser = argparse.ArgumentParser()
    parser.add_argument("corpus_def")
    parser.add_argument("config")
    parser.add_argument("param_file")
    args = parser.parse_args()

    corpora_and_timelines = []
    with open(args.corpus_def) as f:
        corpus_defs = json.load(f)
    for corpus_def in corpus_defs["corpora"]:
        timeline_dir = corpus_def["tl_dir"]
        corpus_pickle = corpus_def["corpus_pkl"]
        corpus = load_corpus(corpus_pickle)
        timelines = []
        for tl_fname in iter_files(timeline_dir, ".txt"):
            with open(tl_fname, encoding="latin-1") as f:
                timeline = Timeline.from_file(f)
            timelines.append((os.path.basename(tl_fname), timeline))
        corpora_and_timelines.append((corpus, timelines))

    with open(args.config) as f:
        config = json.load(f)

    tl_gen = GloballyClusteredSentenceCompressionTimelineGenerator(config)
    parameters = tl_gen.run_scoring_cv_train_mode(corpora_and_timelines)
    with open(args.param_file, "wb") as f_out:
        pickle.dump(parameters, f_out)
# parser.add_argument("human_tl_dir")
parser.add_argument("system_tl_dir")
parser.add_argument("relevant_systems", nargs="+")
parser.add_argument("outfile")
args = parser.parse_args()

relevant_systems = set(args.relevant_systems)
all_relevant_timelines = defaultdict(lambda: defaultdict(dict))
for directory in iter_dirs(args.system_tl_dir):
    system_name = os.path.basename(directory)
    if system_name not in relevant_systems:
        continue  # check once per system instead of once per file
    for tl_dir in iter_dirs(directory):
        for tlfilename in iter_files(tl_dir, ".txt"):
            with open(tlfilename) as tlfile:
                all_relevant_timelines[system_name][
                    os.path.basename(tl_dir)][
                    os.path.basename(tlfilename)] = Timeline.from_file(tlfile)

# Human reference timelines could be read the same way:
# for directory in iter_dirs(args.human_tl_dir):
#     source_name = os.path.basename(directory)
#     for tlfilename in iter_files(directory, ".txt"):
#         with open(tlfilename, errors='ignore') as tlfile:
#             all_relevant_timelines["human"][source_name][
#                 os.path.basename(tlfilename)] = Timeline.from_file(tlfile)

vectorized_timelines = vectorize_timelines(all_relevant_timelines)
from utils import iter_files, make_vocab, split
import json
import re
import shutil
from tqdm import tqdm

s_path = r'E:\HTML\a'
d_path = r'E:\DATASET\arxiv_json\arxiv_html'
for file in iter_files(s_path):
    try:
        shutil.move(file, d_path)
    except Exception:
        pass  # e.g. a name collision at the destination

# in_path = '/home/yhj/dataset/conference_json'
# path = '/home/yhj/dataset/emnlp'
# path = r'E:\DATASET\arxiv_tex'
# for file in tqdm(list(iter_files(path))):
#     paper = json.load(open(file))
#     art = paper['article']
#     abs = paper['abstract']
#     con = paper['conclusion']
#     paper['article'] = [' '.join(each.split()) for each in art]
#     paper['abstract'] = [' '.join(each.split()) for each in abs]
#     paper['conclusion'] = [' '.join(each.split()) for each in con]
#     json.dump(paper, open(file, 'w'), indent=4)
# split(path, '/home/yhj/dataset/emnlp')
# make_vocab(in_path, path)
parser = argparse.ArgumentParser()
parser.add_argument('db_path', type=str, default=None,
                    help='Path to sqlite db holding document texts')
parser.add_argument('out_dir', type=str, default=None,
                    help='Directory for saving output files')
parser.add_argument('--ngram', type=int, default=1,
                    help=('Use up to N-size n-grams '
                          '(e.g. 2 = unigrams + bigrams)'))
parser.add_argument('--hash-size', type=int, default=int(math.pow(2, 24)),
                    help='Number of buckets to use for hashing ngrams')
parser.add_argument('--num-workers', type=int, default=4,
                    help='Number of CPU processes (for tokenizing, etc)')
args = parser.parse_args()

db_files = list(utils.iter_files(args.db_path))
for i, f in enumerate(db_files):
    logger.info('Processing file %i...' % i)
    logger.info('Counting words...')
    count_matrix, doc_dict = get_count_matrix(args, 'sqlite', {'db_path': f})
    logger.info('Getting word-doc frequencies...')
    freqs = get_doc_freqs(count_matrix)
    basename = os.path.splitext(os.path.basename(f))[0]
    basename += ('-ngram=%d-hash=%d' % (args.ngram, args.hash_size))
# (tail of the labeling function; the snippet starts mid-function)
            break
    paper['extracted'] = extracted
    paper['score'] = scores
    json.dump(paper, open(save_name, 'w'), indent=4)


if __name__ == "__main__":
    path = '/home/yhj/dataset/emnlp_mix_int_he'
    for split in ['train', 'test', 'val']:
        print("labeling %s..." % split)
        data_path = os.path.join(path, split)
        files = list(iter_files(data_path))
        t1 = time.time()
        save_path = os.path.join(path, split)
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        with mp.Pool() as pool:
            list(pool.imap_unordered(label(save_path), files, chunksize=1024))
        t2 = time.time()
        print('%s time cost : %.1f seconds' % (split, (t2 - t1)))
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("Merge count matrix")

parser = argparse.ArgumentParser()
parser.add_argument('ct_path', type=str, default=None,
                    help='Path to count matrices')
parser.add_argument('out_dir', type=str, default=None,
                    help='Directory for saving output files')
args = parser.parse_args()

ct_files = list(utils.iter_files(args.ct_path))
logger.info('Loading the zeroth count matrix...')
mat, metadata = utils.load_sparse_csr(ct_files[0])
DOC2IDX, doc_ids = metadata['doc_dict']
for i in range(1, len(ct_files)):
    logger.info('Loading %ith count matrix...' % i)
    nxt_mat, nxt_metadata = utils.load_sparse_csr(ct_files[i])
    # all shards must have been built with identical hashing settings
    if metadata['hash_size'] != nxt_metadata['hash_size']:
        raise RuntimeError('hash_size not equal in %ith file' % i)
    if metadata['ngram'] != nxt_metadata['ngram']:
        raise RuntimeError('ngram not equal in %ith file' % i)
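# The fragment ends after the consistency checks; a plausible continuation,
# assuming each count matrix is hash_size x num_docs so that merging means
# stacking document columns (the doc-id bookkeeping below is an assumption,
# not confirmed by the snippet):
#
#     mat = sparse.hstack([mat, nxt_mat]).tocsr()
#     nxt_DOC2IDX, nxt_doc_ids = nxt_metadata['doc_dict']
#     doc_ids.extend(nxt_doc_ids)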
if __name__ == "__main__":
    save_root = '/home/yhj/dataset/conference_json'
    root_path = '/home/yhj/dataset/conference'
    global save_path
    for path in os.listdir(root_path):
        path = os.path.join(root_path, path)
        print(path)
        save_path = os.path.join(save_root, basename(path))
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        # resume from however many files were already extracted
        start = len(os.listdir(save_path))
        print("start with %d" % start)
        files = list(iter_files(path))
        start_time = time.time()
        for i in tqdm(range(start, len(files))):
            extract(i, files[i])
    delete_skip(save_root)
def delete_skip(path):
    for file in iter_files(path):
        if file.endswith('skip'):
            os.remove(file)
batch_size = 64      # Batch size for training
epochs = 2           # Number of epochs to train for
latent_dim = 256     # Latent dimensionality of the encoding space
num_samples = 10000  # Number of samples to train on
# Path to the data txt files on disk.
data_path = 'data/raw_sent_pair/fluency/'

# Vectorize the data.
input_texts = []
target_texts = []
input_characters = set()
target_characters = set()
for file in utils.iter_files(data_path):
    with open(file, 'r') as f:
        lines = f.read().split('\n')
    for line in lines[:min(num_samples, len(lines) - 1)]:
        input_text, target_text = line.split('\t')
        # We use "tab" as the "start sequence" character for the targets,
        # and "\n" as the "end sequence" character.
        target_text = '\t' + target_text + '\n'
        input_texts.append(input_text)
        target_texts.append(target_text)
        for char in input_text:
            if char not in input_characters:
                input_characters.add(char)
        for char in target_text:
            if char not in target_characters:
                target_characters.add(char)
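# A plausible next step, mirroring the standard Keras character-level seq2seq
# recipe (these index variables are an assumption, not part of the original):
input_characters = sorted(input_characters)
target_characters = sorted(target_characters)
num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)
max_encoder_seq_length = max(len(txt) for txt in input_texts)
max_decoder_seq_length = max(len(txt) for txt in target_texts)
input_token_index = {char: i for i, char in enumerate(input_characters)}
target_token_index = {char: i for i, char in enumerate(target_characters)}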