def corpus_parser(data_dir, to_index_dir, pool_size):
    """Parse every corpus file under *data_dir* in parallel.

    Each file is rewritten into *to_index_dir* by ``process_file`` using a
    pool of *pool_size* worker processes.

    Args:
        data_dir: Directory containing the raw corpus files
            (e.g. '/ssd2/francisco/robust_corpus/').
        to_index_dir: Destination directory for the parsed output
            (e.g. './robust_dir/robust_corpus/').
        pool_size: Number of worker processes to spawn.
    """
    ir_utils.create_dir(to_index_dir)
    corpus_files = get_filenames(data_dir)
    print(len(corpus_files))
    pool = multiprocessing.Pool(processes=pool_size, initializer=start_process)
    # Bind the fixed output directory so workers only receive a file path.
    process_file_partial = partial(process_file, to_index_dir)
    # Blocking map (instead of a fire-and-forget map_async whose AsyncResult
    # was discarded) waits for completion AND re-raises any worker exception
    # instead of silently dropping it.
    pool.map(process_file_partial, corpus_files)
    pool.close()  # no more tasks
    pool.join()   # wrap up current tasks
def process_file(to_index_dir, file):
    """Extract and clean the text inside indexable SGML tags from one file.

    Reads *file* as ISO-8859-1 and writes a UTF-8 copy under *to_index_dir*,
    mirroring the file's parent directory name. Tag lines themselves are
    copied verbatim; every line that falls inside an open tag region is
    cleaned with ``ir_utils.remove_sc`` before being written.

    Args:
        to_index_dir: Root directory for the cleaned output files.
        file: Path of the raw corpus file to process.
    """
    filename = file.split('/')[-1]
    print(to_index_dir)
    # Mirror the file's immediate parent directory under the output root.
    outdir = to_index_dir + '/'.join(file.split('/')[-2:-1]) + '/'
    ir_utils.create_dir(outdir)
    file_out = outdir + filename
    open_tags = ['<H3>', '<HT>', '<TEXT>', '<HEADLINE>']
    close_tags = ['</H3>', '</HT>', '</TEXT>', '</HEADLINE>']
    try:
        with open(file, 'rt', encoding="ISO-8859-1") as input_f, open(
                file_out, 'wt', encoding="utf-8") as out_f:
            open_tag = False
            for line in input_f:
                # Close tags are checked first so the closing line ends the
                # region and is emitted verbatim, never cleaned.
                if any(tag in line for tag in close_tags):
                    out_f.write(line)
                    open_tag = False
                    continue
                elif any(tag in line for tag in open_tags):
                    open_tag = True
                    out_f.write(line)
                    continue
                if open_tag:
                    line = ir_utils.remove_sc(line) + '\n'
                    out_f.write(line)
        print('Saved :', file_out)
    except Exception as exc:
        # Narrowed from a bare 'except:' so KeyboardInterrupt/SystemExit
        # still propagate; report the cause instead of hiding it.
        print('error processing file :', file, '-', exc)
def corpus_parser(data_dir, to_index_dir, pool_size):
    """Convert every PubMed XML file under *data_dir* to JSON in parallel.

    NOTE(review): this redefines ``corpus_parser`` — only the last
    definition in the module is callable; consider renaming one variant.

    TODO: pass *to_index_dir* through to the workers instead of relying on
    a destination hard-coded inside ``pubmed_xml_to_json``.

    Args:
        data_dir: Directory containing the PubMed XML files
            (e.g. '/ssd/francisco/pubmed19/').
        to_index_dir: Destination directory (currently only created here).
        pool_size: Number of worker processes to spawn.
    """
    ir_utils.create_dir(to_index_dir)
    pubmed_files = get_filenames(data_dir)
    # Assign work to the multiprocessing pool. Blocking map propagates
    # worker exceptions, unlike the previous fire-and-forget map_async
    # whose errors were silently discarded.
    pool = multiprocessing.Pool(processes=pool_size, initializer=start_process)
    pool.map(pubmed_xml_to_json, pubmed_files)
    pool.close()  # no more tasks
    pool.join()   # wrap up current tasks
def build(self):
    """Build an Indri index by running IndriBuildIndex as a subprocess.

    Creates ``self.index_dir`` first, then invokes the toolkit binary with
    the instance's parameter file and stopwords file, echoing its output.

    Returns:
        'Ok' when no stderr stream was captured, otherwise None
        (implicitly).
    """
    ir_utils.create_dir(self.index_dir)
    build_index_command = self.ir_toolkit_location + 'buildindex/IndriBuildIndex'
    toolkit_parameters = [
        build_index_command,
        self.parameter_file_location,
        self.stopwords_file,
    ]
    print(toolkit_parameters)
    proc = subprocess.Popen(toolkit_parameters,
                            stdin=subprocess.PIPE,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.STDOUT,
                            shell=False)
    (out, err) = proc.communicate()
    print(out.decode("utf-8"))
    # NOTE(review): stderr is merged into stdout above, so 'err' is always
    # None here and this never reports a real error; check proc.returncode
    # if failure detection is needed.
    print('Index error: ', err)
    if err is None:  # 'is None', not '== None' (PEP 8 identity check)
        return 'Ok'
parser = argparse.ArgumentParser( description='Example 1 - sequential and local execution.') parser.add_argument('--dataset', type=str, help='') parser.add_argument('--data_split', type=str, help='') parser.add_argument('--fold', type=str, help='') args = parser.parse_args() # args = fakeParser() ir_toolkit_location = '../../../indri-l2r/' dataset = args.dataset workdir = './' + dataset + '_dir/' to_index_dir = workdir + dataset + '_corpus/' index_dir = workdir + dataset + '_indri_index' ir_utils.create_dir(workdir) confdir = './' + dataset + '_config/' parameter_file_location = confdir + dataset + '_index_param_file' stopwords_file = confdir + 'stopwords' if (not args.fold or args.dataset == 'bioasq'): args.fold = [''] elif args.fold == 'all': args.fold = ['1', '2', '3', '4', '5'] # args.fold = ['1'] else: args.fold = [args.fold] # Generate all features for robust # Later, divide them according to fold and data_split if args.dataset == 'robust':