import glob
import os
from collections import defaultdict

from gensim.corpora.wikicorpus import extract_pages, filter_wiki
from nltk.tokenize import sent_tokenize

# xutils, detector, ngram and preprocess_line are assumed to be
# project-local helpers defined or imported elsewhere in this repository.


def __detect_concepts(input_file, output_file, concepts, window_size):
    ''' detect tokens and concepts in one file '''
    print(input_file)
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    d = detector(ngram(xutils.open_file(input_file), window_size),
                 concepts, window_size)
    f = xutils.open_file(output_file, 'wt')
    for ts in d:
        for t in ts:
            f.write(t)
            if t != '\n':
                f.write(' ')  # space-separate tokens; write newlines bare
    f.close()
def preprocess_merge(input_files, output_file):
    ''' preprocess several input files and merge them into one output '''
    fout = xutils.open_file(output_file, 'wt')
    for input_file in input_files:
        fin = xutils.open_file(input_file, 'rt')  # fixed: was input_files
        lines = fin
        if 'pmc_ocr' in input_file:
            # OCR text: rejoin words hyphenated across line breaks, flatten
            # the remaining newlines, then re-split into sentences.
            lines = sent_tokenize(
                fin.read().replace('-\n', '').replace('\n', ' '))
        for line in lines:
            new_line = preprocess_line(line)
            if new_line:
                fout.write(new_line)
                fout.write('\n')
        fin.close()
        fout.write('\n\n')  # blank line between documents
    fout.close()
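def _demo_ocr_cleanup():
    # Illustration only (not from the source): for 'pmc_ocr' files the
    # hyphenated line breaks must be rejoined *before* the remaining
    # newlines are flattened to spaces, otherwise words stay split.
    text = 'immuno-\nhistochemistry was per-\nformed\non samples'
    cleaned = text.replace('-\n', '').replace('\n', ' ')
    assert cleaned == 'immunohistochemistry was performed on samples'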
def extract_pubmed(input_dir, output_dir):
    ''' extract titles and abstracts from gzipped PubMed XML files '''
    os.makedirs(output_dir, exist_ok=True)
    flags = [["<ArticleTitle>", "</ArticleTitle>"],
             ["<AbstractText>", "</AbstractText>"]]
    for fn in glob.glob(input_dir + '/**/*.xml.gz', recursive=True):
        print(fn)
        fin = xutils.open_file(fn, 'rt')
        fout = xutils.open_file(
            output_dir + '/' +
            os.path.splitext(os.path.basename(fn))[0] + '.txt', 'wt')
        for line in fin:
            for flag in flags:
                i = line.find(flag[0])
                while i >= 0:  # fixed: find() returns -1 when absent, and 0 is a valid hit
                    j = line.find(flag[1], i)
                    txt = line[i + len(flag[0]):j]  # was a hard-coded offset of 14
                    i = line.find(flag[0], j)
                    fout.write(txt)
                    fout.write('\n')
        fin.close()  # added: the input handle was never closed
        fout.close()
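def _demo_tag_scan():
    # Toy check (illustrative only) of the tag-scanning loop above: every
    # flagged tag pair on a line is extracted, including a match starting
    # at position 0, which the original `i > 0` test would have skipped.
    line = '<ArticleTitle>Gene regulation</ArticleTitle>'
    flag = ['<ArticleTitle>', '</ArticleTitle>']
    i, out = line.find(flag[0]), []
    while i >= 0:
        j = line.find(flag[1], i)
        out.append(line[i + len(flag[0]):j])
        i = line.find(flag[0], j)
    assert out == ['Gene regulation']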
def __corpus_counter(task):
    ''' count token frequencies in one file '''
    print(task)
    freq = defaultdict(int)
    with xutils.open_file(task) as file:
        for line in file:
            for word in line.strip().split():
                freq[word] += 1
                # detected concepts are joined with '_'; count the
                # constituent tokens as well
                tok = word.split('_')
                if len(tok) > 1:
                    for t in tok:
                        freq[t] += 1
    return freq
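def _count_corpus(files, workers=4):
    # Hypothetical driver, not in the source: since __corpus_counter
    # returns an independent per-file count, a natural usage is to map it
    # over the corpus files in parallel and merge the partial dictionaries.
    from collections import Counter
    from multiprocessing import Pool
    total = Counter()
    with Pool(workers) as pool:
        for partial in pool.imap_unordered(__corpus_counter, files):
            total.update(partial)
    return total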
def wikipedia_extract(input_file, output_dir):
    ''' extract article text from a Wikipedia dump into fixed-size chunks '''
    chunk_size = 50000000  # chunk size in characters
    os.makedirs(output_dir, exist_ok=True)
    fin = xutils.open_file(input_file, 'rt')
    extractor = extract_pages(fin, ['0'])  # namespace 0: articles only
    # start counter at chunk_size so the first page opens a new chunk
    fout, counter, chunk = None, chunk_size, -1
    for page in extractor:
        if page[1]:
            text = filter_wiki(page[1])
            if counter >= chunk_size:
                if fout:
                    fout.close()
                counter, chunk = 0, chunk + 1
                output_file = '%s/%s_%d.txt.gz' % (
                    output_dir, os.path.basename(input_file), chunk)
                fout = xutils.open_file(output_file, 'wt')
                print(output_file)
            counter += len(text)
            fout.write(text)
            fout.write('\n\n\n\n')
    if fout:
        fout.close()  # added: the last chunk was never closed
    fin.close()
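if __name__ == '__main__':
    # Hypothetical end-to-end run; the dump filename and directory paths
    # below are placeholders, not paths taken from the source.
    wikipedia_extract('enwiki-latest-pages-articles.xml.bz2', 'corpus/wiki')
    extract_pubmed('pubmed/baseline', 'corpus/pubmed')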