fw.close()
# NOTE(review): the `fw.close()` statement above is the tail of a definition
# whose start lies outside this view. It is reproduced verbatim and has to be
# re-indented into that definition when the preceding lines are reformatted.


# Command-line driver.
#
# Usage: <script> INDIR OUTDIR MODE
#   INDIR  -- directory that is globbed for input files
#   OUTDIR -- directory that receives the output (created if missing)
#   MODE   -- 'article': every INDIR/*.html is handed to `run` together with
#             OUTDIR/articles and OUTDIR/title;
#             anything else (the original comment names 'highlight'): the
#             base name of every INDIR/*.story is handed to
#             `extract_highlight` together with INDIR and OUTDIR.
#
# The work sits behind a __main__ guard because multiprocessing requires the
# main module to be importable without side effects when worker processes are
# started with the "spawn" method.
if __name__ == '__main__':
    if len(sys.argv) < 4:
        sys.exit('usage: {} INDIR OUTDIR MODE  (MODE: highlight/article)'
                 .format(sys.argv[0]))

    indir = sys.argv[1]
    outdir = sys.argv[2]
    mode = sys.argv[3]  # highlight/article

    if mode == 'article':
        article_dir = '{}/articles'.format(outdir)
        title_dir = '{}/title'.format(outdir)
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(article_dir, exist_ok=True)
        os.makedirs(title_dir, exist_ok=True)
        worker = run
        params = [(article_dir, title_dir, k)
                  for k in glob('{}/*.html'.format(indir))]
    else:
        os.makedirs(outdir, exist_ok=True)
        worker = extract_highlight
        # basename() instead of split('/') so the platform's path separator
        # is honoured; the name is still cut at its first '.' as before.
        params = [(indir, outdir, os.path.basename(k).split('.')[0])
                  for k in glob('{}/*.story'.format(indir))]

    print('processing {} files...'.format(len(params)))

    # NOTE(review): `cpu_counts` is not a multiprocessing name (that one is
    # `cpu_count`); presumably it is defined or imported elsewhere in this
    # file -- confirm.
    pool = Pool(cpu_counts())
    # Tasks are dispatched to the workers in chunks of 1000.
    pool.map(worker, params, 1000)
    pool.close()
    # Wait for the worker processes to exit before the script ends.
    pool.join()
ner.append(prev_ent_type) phrase = [] line.append(word_text) pos.append(word_pos) ner.append(word.ent_type_) prev_ent_type = word.ent_type_ fwl.write('{} .\n'.format(' '.join(line))) fwp.write('{} EOS\n'.format(' '.join(pos))) fwn.write('{} EOS\n'.format(' '.join(ner))) fwl.close() fwp.close() fwn.close() basedir = sys.argv[1] article_dir = sys.argv[2] article_title = sys.argv[3] linedir = '{}/{}_spacy_line/'.format(basedir, article_title) posdir = '{}/{}_spacy_pos/'.format(basedir, article_title) nerdir = '{}/{}_spacy_ner/'.format(basedir, article_title) mkdir_p(linedir) mkdir_p(posdir) mkdir_p(nerdir) filelist = glob('{}/*'.format(article_dir)) print('processing {} files...'.format(len(filelist))) pool = Pool(cpu_counts()) pool.map(run, filelist) pool.close()