Ejemplo n.º 1
0
    fw.close()


# Command-line arguments: input directory, output directory and run mode.
indir = sys.argv[1]
outdir = sys.argv[2]
mode = sys.argv[3]  # highlight/article

# Only the process started as a script drives the pool.  Worker processes
# created with the "spawn" start method re-import this module, and without
# this guard each of them would try to build a pool of its own.
if __name__ == '__main__':
    if mode == 'article':
        # Articles and titles are written to separate sub-directories.
        article_dir = '{}/articles'.format(outdir)
        title_dir = '{}/title'.format(outdir)
        for target_dir in (article_dir, title_dir):
            if not os.path.exists(target_dir):
                os.makedirs(target_dir)

        worker_fn = run
        params = [(article_dir, title_dir, k)
                  for k in glob('{}/*.html'.format(indir))]
    else:
        # Every mode other than 'article' takes the highlight path.
        if not os.path.exists(outdir):
            os.makedirs(outdir)

        worker_fn = extract_highlight
        # Each task carries the file name up to its first dot; basename()
        # copes with whatever path separator glob() returns.
        params = [(indir, outdir, os.path.basename(k).split('.')[0])
                  for k in glob('{}/*.story'.format(indir))]

    print('processing {} files...'.format(len(params)))

    # NOTE(review): cpu_counts is not defined in the visible part of the
    # file (multiprocessing exposes cpu_count) - confirm it is in scope.
    pool = Pool(cpu_counts())
    try:
        # Tasks are handed to the workers in chunks of 1000.
        pool.map(worker_fn, params, 1000)
    finally:
        # Shut the pool down even when a task raised, then wait for the
        # worker processes to exit.
        pool.close()
        pool.join()
Ejemplo n.º 2
0
                ner.append(prev_ent_type)
                phrase = []
            line.append(word_text)
            pos.append(word_pos)
            ner.append(word.ent_type_)
            prev_ent_type = word.ent_type_
        fwl.write('{} .\n'.format(' '.join(line)))
        fwp.write('{} EOS\n'.format(' '.join(pos)))
        fwn.write('{} EOS\n'.format(' '.join(ner)))
    fwl.close()
    fwp.close()
    fwn.close()

# Command-line arguments: base output directory, directory holding the input
# articles, and the label used to name the three output directories.
basedir = sys.argv[1]
article_dir = sys.argv[2]
article_title = sys.argv[3]

# Output directories for the line, POS and NER files.  They stay at module
# level so they remain defined when a worker re-imports this module
# (presumably run() reads them, since it is only passed a file name -
# TODO confirm against run()).
linedir = '{}/{}_spacy_line/'.format(basedir, article_title)
posdir = '{}/{}_spacy_pos/'.format(basedir, article_title)
nerdir = '{}/{}_spacy_ner/'.format(basedir, article_title)

# Only the process started as a script drives the pool.  Worker processes
# created with the "spawn" start method re-import this module, and without
# this guard each of them would try to build a pool of its own.
if __name__ == '__main__':
    for target_dir in (linedir, posdir, nerdir):
        mkdir_p(target_dir)

    filelist = glob('{}/*'.format(article_dir))
    print('processing {} files...'.format(len(filelist)))

    # NOTE(review): cpu_counts is not defined in the visible part of the
    # file (multiprocessing exposes cpu_count) - confirm it is in scope.
    pool = Pool(cpu_counts())
    try:
        pool.map(run, filelist)
    finally:
        # Shut the pool down even when a task raised, then wait for the
        # worker processes to exit.
        pool.close()
        pool.join()