import glob
import json
import os
import sys

from bs4 import BeautifulSoup

import tbs  # project-local block-segmentation helpers
import tf   # project-local n-gram extraction helpers

# Walk every crawled file matched by the glob expression and accumulate
# corpus-level statistics: message, block and character counts.
for p in glob.glob(glob_expression):
    original_filename = build_original_filename(p)
    path_doc = os.path.join(args.corpus, original_filename)
    with open(path_doc, 'r') as f:
        d = json.load(f)
    print(path_doc)
    cpt_doc = 0
    cpt_cut = 0
    len_content = 0
    for url, info in d.items():
        dict_ngram_url = {}
        bs_content = BeautifulSoup(info['content'], 'html.parser')
        cut = tbs.cut_bloc(bs_content.body)
        for c in cut:
            cut2bs = tbs.cut_bloc2bs_elt(c)
            # Character count is the total length of the block's text nodes.
            for s in cut2bs.strings:
                len_content += len(s)
            cpt_cut += 1
        cpt_doc += 1
    res['global']['nbMessages'] += cpt_doc
    res['global']['nbBlocks'] += cpt_cut
    res['global']['nbCars'] += len_content
    dict_ngram_author = {
        'nbMessages': cpt_doc,
        'nbBlocks': cpt_cut,
        'nbCars': len_content,
    }
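# The loop above assumes each corpus file maps a URL to a record holding at
# least a 'content' field (and, further below, a 'title' field), both as HTML
# strings. A minimal sketch of that assumed input shape -- hypothetical
# example data, not taken from the real corpus:
#
#   d = {
#       "http://example.com/thread/1": {
#           "title": "<h1>Thread title</h1>",
#           "content": "<div><p>First block</p><p>Second block</p></div>",
#       },
#   }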
# Refuse to run if the output directory is missing, rather than failing on write.
if not os.path.isdir(args.diroutput):
    print('OUTPUTDIR %s does not exist, create it or choose another directory' % args.diroutput)
    sys.exit(1)

##
# args.fileoutput
##
fileoutput = build_json_filename_output(args.path) if args.fileoutput == '' else args.fileoutput
output_json = os.path.join(args.diroutput, fileoutput)

# Extract n-grams at three nested levels: per block, per URL (message) and
# globally for the author; every block feeds all three counters.
dict_ngram_author = {}
for url, info in d.items():
    dict_ngram_url = {}
    bs_content = BeautifulSoup(info['title'] + info['content'], 'html.parser')
    cut = tbs.cut_bloc(bs_content.body)
    res['url'][url] = {'global': True, 'block': []}
    for c in cut:
        cut2bs = tbs.cut_bloc2bs_elt(c)
        dict_ngram_block = {}
        content = ' '.join([s.strip() for s in cut2bs.strings])
        tf.ngram_extractor(content, args.sizengram, dict_ngram_author, dict_ngram_url, dict_ngram_block)
        res['url'][url]['block'].append(dict_ngram_block)
    res['url'][url]['global'] = dict_ngram_url
res['global'] = dict_ngram_author

with open(output_json, 'w') as f:
    json.dump(res, f)
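# tf.ngram_extractor is a project-local helper whose implementation is not
# shown here. A minimal sketch of the behaviour its call site implies --
# tokenize the block text, then increment the same n-gram counters at block,
# URL and author level in one pass. This is a hypothetical illustration, not
# the project's actual implementation:
def ngram_extractor_sketch(content, size_ngram, d_author, d_url, d_block):
    tokens = content.split()
    for i in range(len(tokens) - size_ngram + 1):
        ngram = ' '.join(tokens[i:i + size_ngram])
        # The same key is bumped in all three dicts, which is what lets the
        # caller read off block-, URL- and author-level counts afterwards.
        for counter in (d_author, d_url, d_block):
            counter[ngram] = counter.get(ngram, 0) + 1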