def main(argv):
	# Read the configuration file
	InitConfig()
	config = GetConfig()
	config.read([os.path.join(os.path.dirname(__file__),'../config/msiip_simple.cfg')])

	# Set up logging
	log_level_key = config.get('logging','level')
	run_code_name = os.path.basename(sys.argv[0])[0:-3]
	logging.basicConfig(
		filename = os.path.join(os.path.dirname(__file__), '../../output/logs',
			'%s_%s.log' % (run_code_name, time.strftime('%Y-%m-%d', time.localtime(time.time())))),
		level = GetLogLevel(log_level_key),
		format = '%(asctime)s %(levelname)8s %(lineno)4d %(module)s:%(name)s.%(funcName)s: %(message)s')
	
	parser = argparse.ArgumentParser(description='Extract Semantic Tag Data.')
	parser.add_argument('sub_utters_file', help='Input sub-utterances file')
	parser.add_argument('output', help='Output file')

	args = parser.parse_args()

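	# SemTagExtractor.ProcSent tags one sub-utterance; results are read back from its tok_* lists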
	extractor = SemTagExtractor()
	output = codecs.open(args.output, 'w', 'utf-8')

	walker = sub_utters_data_walker(args.sub_utters_file)
	count = 0
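	# Walk (previous, current) utterance pairs; for each successfully tagged sub-utterance,
	# write one tab-separated line per token and a blank line between sub-utterances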
	for (pre_utter, cul_utter) in walker.ReadUtter():
		for sub_tag in cul_utter['sub_tag_list']:
			count += 1
			if count % 100 == 0:
				sys.stderr.write('%d\n' %(count))
			extractor.ProcSent(sub_tag)
			if extractor.success:
				for (token, lemma, POStag, SemTag, BIOTag) in zip(extractor.tok_TokenList, extractor.tok_LemmaList, extractor.tok_POSList, extractor.tok_TagList, extractor.tok_BIOList):
					title_flag = 'Y' if token.istitle() else 'N'
					print >>output, '%s\t%s\t%s\t%s\t%s\t%s' %(token, lemma, POStag, title_flag, SemTag, BIOTag)
				print >>output
	output.close()


def main(argv):
    # Read the configuration file
    InitConfig()
    config = GetConfig()
    config.read([os.path.join(os.path.dirname(__file__), "../config/msiip_simple.cfg")])

    # Set up logging
    log_level_key = config.get("logging", "level")
    run_code_name = os.path.basename(sys.argv[0])[0:-3]
    logging.basicConfig(
        filename=os.path.join(
            os.path.dirname(__file__),
            "../../output/logs",
            "%s_%s.log" % (run_code_name, time.strftime("%Y-%m-%d", time.localtime(time.time()))),
        ),
        level=GetLogLevel(log_level_key),
        format="%(asctime)s %(levelname)8s %(lineno)4d %(module)s:%(name)s.%(funcName)s: %(message)s",
    )

    parser = argparse.ArgumentParser(description="Extract Semantic Tag Data.")
    parser.add_argument("sub_utters_file", help="sub_utters_file")
    parser.add_argument("output", help="Output json file")

    args = parser.parse_args()

    output = codecs.open(args.output, "w", "utf-8")

    walker = sub_utters_data_walker(args.sub_utters_file)

    count = 0
    sub_utters_list = []

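    # Speech-act attributes whose sub-utterance indices we want to collect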
    interesting_attr = ["HOW_MUCH", "HOW_TO", "PREFERENCE", "WHAT", "WHEN", "WHERE", "WHICH"]

    interesting_attr_dic = {attr: [] for attr in interesting_attr}

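    # Record the token string of every readable sub-utterance and, for each interesting
    # attribute, the indices of the sub-utterances that carry it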
    for (pre_utter, cul_utter) in walker.ReadUtter():
        for i, sub_tag in enumerate(cul_utter["sub_tag_list"]):
            count += 1
            if count % 100 == 0:
                sys.stderr.write("%d\n" % (count))
            (token_list, _, _) = SemTagExtractor._ReadSentTags(sub_tag)
            if token_list:
                sub_utters_list.append(" ".join(token_list))

                for attr in cul_utter["speech_acts"][i]["attributes"]:
                    if attr in interesting_attr:
                        interesting_attr_dic[attr].append(len(sub_utters_list) - 1)

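    # Print the overall sub-utterance count and the per-attribute counts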
    print "all", count
    for attr, attr_list in interesting_attr_dic.items():
        print attr, len(attr_list)

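    # Write the collected sub-utterances and the per-attribute index lists as JSON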
    out_json = {
        "sub_utter_data": sub_utters_list,
        "attr_data_index": interesting_attr_dic,
    }
    json.dump(out_json, output, indent=4)
    output.close()