def main(argv):
    """Extract semantic-tag training data.

    For every successfully processed sub-utterance, write one line per token
    containing: token, lemma, POS tag, title flag ('Y'/'N'), semantic tag,
    BIO tag — tab-separated — followed by a blank line between sub-utterances.

    argv -- command-line vector (argv[0] is the program name), e.g. sys.argv.
    """
    # Load the project configuration file.
    InitConfig()
    config = GetConfig()
    config.read([os.path.join(os.path.dirname(__file__),
                              '../config/msiip_simple.cfg')])

    # Set up logging: one log file per script per day under output/logs.
    log_level_key = config.get('logging', 'level')
    # splitext is more robust than slicing off the last 3 chars ('.py').
    run_code_name = os.path.splitext(os.path.basename(argv[0]))[0]
    logging.basicConfig(
        filename=os.path.join(
            os.path.dirname(__file__), '../../output/logs',
            '%s_%s.log' % (run_code_name,
                           time.strftime('%Y-%m-%d',
                                         time.localtime(time.time())))),
        level=GetLogLevel(log_level_key),
        format='%(asctime)s %(levelname)8s %(lineno)4d %(module)s:%(name)s.%(funcName)s: %(message)s')

    parser = argparse.ArgumentParser(description='Extract Semantic Tag Data.')
    parser.add_argument('sub_utters_file', help='sub_utters_file')
    parser.add_argument('output', help='Output file')
    # Parse from argv instead of implicitly reading sys.argv, so the argv
    # parameter is actually honored (equivalent when argv is sys.argv).
    args = parser.parse_args(argv[1:])

    extractor = SemTagExtractor()
    output = codecs.open(args.output, 'w', 'utf-8')
    try:
        walker = sub_utters_data_walker(args.sub_utters_file)
        count = 0
        for (pre_utter, cul_utter) in walker.ReadUtter():
            for sub_tag in cul_utter['sub_tag_list']:
                count += 1
                if count % 100 == 0:
                    sys.stderr.write('%d\n' % (count))  # progress heartbeat
                extractor.ProcSent(sub_tag)
                if not extractor.success:
                    continue  # skip sub-utterances the extractor rejected
                for (token, lemma, POStag, SemTag, BIOTag) in zip(
                        extractor.tok_TokenList, extractor.tok_LemmaList,
                        extractor.tok_POSList, extractor.tok_TagList,
                        extractor.tok_BIOList):
                    # Capitalization feature: 'Y' when the token is title-cased.
                    title_flag = 'Y' if token.istitle() else 'N'
                    # write() instead of Python-2-only `print >>output`;
                    # the emitted bytes are identical.
                    output.write('%s\t%s\t%s\t%s\t%s\t%s\n'
                                 % (token, lemma, POStag, title_flag,
                                    SemTag, BIOTag))
                # Blank line separates sub-utterances.
                output.write('\n')
    finally:
        # Close even if extraction raises (original leaked the handle on error).
        output.close()
def main(argv):
    """Collect sub-utterance texts plus, for a fixed set of question-type
    attributes, the indices of the sub-utterances carrying each attribute,
    and dump both as JSON:
    {"sub_utter_data": [...], "attr_data_index": {attr: [indices...]}}.

    NOTE(review): this redefines the main() declared earlier in the file;
    only this later definition is effective at runtime.

    argv -- command-line vector (argv[0] is the program name), e.g. sys.argv.
    """
    # Load the project configuration file.
    InitConfig()
    config = GetConfig()
    config.read([os.path.join(os.path.dirname(__file__),
                              "../config/msiip_simple.cfg")])

    # Set up logging: one log file per script per day under output/logs.
    log_level_key = config.get("logging", "level")
    # splitext is more robust than slicing off the last 3 chars ('.py').
    run_code_name = os.path.splitext(os.path.basename(argv[0]))[0]
    logging.basicConfig(
        filename=os.path.join(
            os.path.dirname(__file__),
            "../../output/logs",
            "%s_%s.log" % (run_code_name,
                           time.strftime("%Y-%m-%d",
                                         time.localtime(time.time()))),
        ),
        level=GetLogLevel(log_level_key),
        format="%(asctime)s %(levelname)8s %(lineno)4d %(module)s:%(name)s.%(funcName)s: %(message)s",
    )

    parser = argparse.ArgumentParser(description="Extract Semantic Tag Data.")
    parser.add_argument("sub_utters_file", help="sub_utters_file")
    parser.add_argument("output", help="Output json file")
    # Parse from argv instead of implicitly reading sys.argv, so the argv
    # parameter is actually honored (equivalent when argv is sys.argv).
    args = parser.parse_args(argv[1:])

    walker = sub_utters_data_walker(args.sub_utters_file)
    count = 0
    sub_utters_list = []
    # Question-type attributes we index sub-utterances by.
    interesting_attr = ["HOW_MUCH", "HOW_TO", "PREFERENCE", "WHAT",
                        "WHEN", "WHERE", "WHICH"]
    interesting_attr_dic = {attr: [] for attr in interesting_attr}

    output = codecs.open(args.output, "w", "utf-8")
    try:
        for (pre_utter, cul_utter) in walker.ReadUtter():
            for i, sub_tag in enumerate(cul_utter["sub_tag_list"]):
                count += 1
                if count % 100 == 0:
                    sys.stderr.write("%d\n" % (count))  # progress heartbeat
                # NOTE(review): reaches into a private helper of
                # SemTagExtractor; consider exposing a public API for this.
                (token_list, _, _) = SemTagExtractor._ReadSentTags(sub_tag)
                if not token_list:
                    continue  # nothing extracted from this sub-utterance
                sub_utters_list.append(" ".join(token_list))
                for attr in cul_utter["speech_acts"][i]["attributes"]:
                    if attr in interesting_attr:
                        # Index of the sub-utterance we just appended.
                        interesting_attr_dic[attr].append(
                            len(sub_utters_list) - 1)

        # Summary counts on stdout; write() instead of the Python-2-only
        # `print` statement — the emitted text is identical.
        sys.stdout.write("all %d\n" % count)
        for attr, attr_list in interesting_attr_dic.items():
            sys.stdout.write("%s %d\n" % (attr, len(attr_list)))

        out_json = {
            "sub_utter_data": sub_utters_list,
            "attr_data_index": interesting_attr_dic,
        }
        json.dump(out_json, output, indent=4)
    finally:
        # Close even if processing raises (original leaked the handle on error).
        output.close()