def normalize(target, source, env):
    """
    Takes the combined IV and OOV results
    Remove keywords not in kwlist
    NEEDS WORK!
    CONVERT TO BUILDER!
    """
    tmpfile_fid, tmpfile_name = tempfile.mkstemp()
    os.close(tmpfile_fid)
    res_xml = et.parse(meta_open(source[0].rstr()))

    # map (prefix, zero-stripped number) -> canonical "prefix-number" keyword ID
    kw_ids = {}
    for kwid in set([x.get("kwid") for x in et.parse(meta_open(source[1].rstr())).getiterator("kw")]):
        prefix, number = kwid.split("-")
        kw_ids[(prefix, number.lstrip("0"))] = "%s-%s" % (prefix, number)
    elems = [x for x in res_xml.getiterator("detected_termlist")]
    for e in elems:
        prefix, number = e.get("termid").split("-")
        number = number.lstrip("0")
        if (prefix, number) not in kw_ids:
            # the term is not in the keyword list, so drop it
            res_xml.getroot().remove(e)
        else:
            # remap to the canonical keyword ID
            e.set("termid", kw_ids[(prefix, number)])

    res_xml.write(tmpfile_name)
    stdout, stderr, success = run_command(env.subst("${PYTHON} ${F4DENORMALIZATIONPY} ${SOURCE} ${TARGET}", target=target, source=tmpfile_name))
    os.remove(tmpfile_name)
    if not success:
        return stderr
    return None
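# A minimal sketch of the Builder conversion requested in the TODO above,
# assuming `normalize` keeps its (target, source, env) signature; the builder
# name and file names are illustrative assumptions:
#
#   env.Append(BUILDERS={"Normalize" : Builder(action=normalize)})
#   env.Normalize("work/normalized.xml",
#                 ["work/combined_results.xml", "work/kwlist.xml"])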
def query_to_phone_fst(target, source, env):
    """
Usage: /mnt/calculon-minor/lorelei_svn/KWS/bin64/query2phonefst [-opts] [outputdir] [querylist]                                        
-d file         dictionary file                                                                 
-s file         use external phone table specified                                              
-O file         file containing prons for oovs, output of l2s system                            
-l file         file to output list of all fsts corresponding to queries                        
-I int          ignore (print empty fst-file) if query has less than <int> phones               
-t double       if specified, tag oovs with soft threshold indicated.                           
-u              if specified, ignore weight of alternative prons                                
-g              add gamma penalty for query length p = p^gamma (gamma=1/length-phone)
-w              if specified, query is represented as one arc per word, not converted to phones 
-p p2pfile      p2pfile, to allow for fuzziness in query (default:no p2p)                       
-n nbest        if p2pfile, this limits number of paths retained after composing query with p2p
-?              info/options
    """
    args = source[-1].read()
    try:
        os.makedirs(args["OUTDIR"])
    except OSError:
        # the output directory may already exist
        pass
    command = env.subst("${QUERY2PHONEFST} -p ${SOURCES[0]} -s ${SOURCES[1]} -d ${SOURCES[2]} -l ${TARGETS[0]} -n %(n)d -I %(I)d %(OUTDIR)s ${SOURCES[3]}" % args, target=target, source=source)
    stdout, stderr, success = run_command(command, env={"LD_LIBRARY_PATH" : env.subst(env["LIBRARY_OVERLAY"])}, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if not success:
        return stderr
    return None
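# Hedged usage sketch (file names and values are illustrative): the source
# ordering follows the command above (p2p FST, phone table, dictionary, query
# list), with a final Value node supplying OUTDIR, n, and I:
#
#   env.Command(["work/query_fsts.list"],
#               ["work/p2p.fst", "work/phones.sym", "work/dict.txt",
#                "work/queries.txt",
#                env.Value({"OUTDIR" : "work/query_fsts", "n" : 100, "I" : 2})],
#               query_to_phone_fst)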
def run_asr_experiment_torque(target, source, env):
    args = source[-1].read()
    construct_command = env.subst("${ATTILA_INTERPRETER} ${SOURCES[1].abspath}", source=source)
    out, err, success = run_command(construct_command)
    if not success:
        return out + err
    stdout = env.Dir(args.get("stdout", args["path"])).Dir("stdout").rstr()
    stderr = env.Dir(args.get("stderr", args["path"])).Dir("stderr").rstr()
    if not os.path.exists(stdout):
        os.makedirs(stdout)
    if not os.path.exists(stderr):
        os.makedirs(stderr)
    command = env.subst("${ATTILA_INTERPRETER} ${SOURCES[2].abspath} -n ${TORQUE_JOBS_PER_SCONS_INSTANCE} -j $${PBS_ARRAYID} -w ${ACOUSTIC_WEIGHT} -l 1", source=source)
    interval = args.get("interval", 10)
    job = torque.Job(args.get("name", "scons"),
                     commands=[command],
                     path=args["path"],
                     stdout_path=stdout,
                     stderr_path=stderr,
                     array=args.get("array", 0),
                     other=args.get("other", ["#PBS -W group_list=yeticcls"]),
                     )
    if env["HAS_TORQUE"]:
        job.submit(commit=True)
        while job.job_id in [x[0] for x in torque.get_jobs(True)]:
            logging.debug("sleeping...")
            time.sleep(interval)
    else:
        logging.info("no Torque server, but I would submit:\n%s" % (job))
    with meta_open(target[0].rstr(), "w") as ofd:
        ofd.write(time.asctime() + "\n")
    return None
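# Hedged sketch of the dictionary this action expects in its final Value
# source; the keys are those read via args.get above, the values illustrative:
#
#   env.Value({"path" : "work/asr", "name" : "asr-job", "array" : 10,
#              "interval" : 30, "other" : ["#PBS -W group_list=yeticcls"]})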
def normalize_sum_to_one(target, source, env):
    """
    NEEDS WORK!
    CONVERT TO BUILDER!
    """
    stdout, stderr, success = run_command(env.subst("java -cp ${JAVA_NORM} normalization.ApplySumToOneNormalization ${SOURCE} ${TARGET}", target=target, source=source))
    if not success:
        return stderr
    return None
def merge_scores(target, source, env):
    """
    NEEDS WORK!
    CONVERT TO BUILDER!
    """
    stdout, stderr, success = run_command(env.subst("${MERGESCORESSUMPOSTNORMPL} ${SOURCES[0]}", target=target, source=source), env={"LD_LIBRARY_PATH" : env.subst("${LIBRARY_OVERLAY}")})
    if not success:
        return stderr
    with meta_open(target[0].rstr(), "w") as ofd:
        ofd.write(stdout)
    return None
def fst_compile(target, source, env):
    """
    Compile an FST using OpenFST's binary 'fstcompile'.
    """
    command = env.subst("${FSTCOMPILE} --isymbols=${SOURCES[0]} --osymbols=${SOURCES[0]} ${SOURCES[1]}", target=target, source=source)
    stdout, stderr, success = run_command(command, env={"LD_LIBRARY_PATH" : env.subst(env["LIBRARY_OVERLAY"])}, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if not success:
        return stderr
    with meta_open(target[0].rstr(), "w") as ofd:
        ofd.write(stdout)
    return None
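# Hedged usage sketch (file names are illustrative): SOURCES[0] is the symbol
# table used for both input and output symbols, SOURCES[1] the text-format
# FST, and the compiled FST is captured from stdout into the target:
#
#   env.Command(["work/query.fst"],
#               ["work/words.sym", "work/query.fst.txt"],
#               fst_compile)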
def build_pad_fst(target, source, env):
    """
printing usage
Usage: /mnt/calculon-minor/lorelei_svn/KWS/bin64/buildpadfst [symtable_file] [output_fst_file]
    """
    command = env.subst("${BUILDPADFST} ${SOURCE} ${TARGET}", target=target, source=source)
    stdout, stderr, success = run_command(command, env={"LD_LIBRARY_PATH" : env.subst(env["LIBRARY_OVERLAY"])}, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if not success:
        return stderr
    return None
def run_asr_experiment(target, source, env):
    args = source[-1].read()
    construct_command = env.subst("${ATTILA_INTERPRETER} ${SOURCES[1].abspath}", source=source)
    out, err, success = run_command(construct_command)
    if not success:
        return out + err
    command = env.subst("${ATTILA_INTERPRETER} ${SOURCES[2].abspath} -n ${LOCAL_JOBS_PER_SCONS_INSTANCE} -j %d -w ${ACOUSTIC_WEIGHT} -l 1", source=source)
    procs = [subprocess.Popen(shlex.split(command % i)) for i in range(env["LOCAL_JOBS_PER_SCONS_INSTANCE"])]
    for p in procs:
        p.wait()
    return None
def ibm_train_language_model(target, source, env):
    text_file = source[0].rstr()
    vocab_file = source[1].rstr()
    n = source[2].read()

    # first create count files
    temp_dir = tempfile.mkdtemp()
    prefix = os.path.join(temp_dir, "temp")
    cmd = "${ATTILA_PATH}/tools/lm_64/CountNGram -n %d %s %s %s" % (n, text_file, vocab_file, prefix)
    out, err, success = run_command(env.subst(cmd))
    if not success:
        return out + err

    # build LM; the target name carries two extra extensions that are stripped to get the prefix
    lm = ".".join(target[0].rstr().split(".")[0:-2])
    cmd = "${ATTILA_PATH}/tools/lm_64/BuildNGram.sh -n %d -arpabo %s %s" % (n, prefix, lm)
    out, err, success = run_command(env.subst(cmd), env={"SFCLMTOOLS" : env.subst("${ATTILA_PATH}/tools/lm_64")})
    if not success:
        return out + err

    # clean up
    for i in range(1, n + 1):
        os.remove("%s.count.%d" % (prefix, i))
    os.remove("%s.count.check" % (prefix))
    os.rmdir(temp_dir)
    return None
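# Hedged usage sketch (file names are illustrative): the final source is a
# Value node holding the n-gram order, and the target name carries two extra
# extensions that are stripped to form the prefix passed to BuildNGram.sh:
#
#   env.Command(["work/lm.3gm.arpabo.gz"],
#               ["work/training_text.txt", "work/vocabulary.txt", env.Value(3)],
#               ibm_train_language_model)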
def merge(target, source, env):
    """
    NEEDS WORK!
    CONVERT TO BUILDER!
    Combines the output of several searches
    input: XML files (<term>)
    output: 
    """
    args = source[-1].read()
    stdout, stderr, success = run_command(env.subst("${PRINTQUERYTERMLISTPRL} -prefix=KW%(LANGUAGE_ID)s- -padlength=%(PADLENGTH)d ${SOURCES[0]}" % args, 
                                                    target=target, source=source), env={"LD_LIBRARY_PATH" : env.subst("${LIBRARY_OVERLAY}")})
    with meta_open(target[0].rstr(), "w") as ofd:
        ofd.write(stdout)
    with meta_open(target[1].rstr(), "w") as ofd:
        ofd.write("\n".join([x.rstr() for x in source[1:-1]]))
    if args["MODE"] == "merge-atwv":
        return "merge-atwv option not supported!"
    else:        
        merge_search_from_par_index = "${MERGESEARCHFROMPARINDEXPRL} -force-decision=\"YES\" ${TARGETS[0]} ${TARGETS[1]}"
        stdout, stderr, success = run_command(env.subst(merge_search_from_par_index, target=target, source=source), env={"LD_LIBRARY_PATH" : env.subst("${LIBRARY_OVERLAY}")})
        with meta_open(target[2].rstr(), "w") as ofd:
            ofd.write(stdout)
        with meta_open(target[3].rstr(), "w") as ofd:
            ofd.write(stdout)
    return None
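# Hedged sketch of the args dictionary this action reads from its final Value
# source; the keys are those used in the substitutions above, the values
# illustrative:
#
#   env.Value({"LANGUAGE_ID" : "106", "PADLENGTH" : 4, "MODE" : "merge-default"})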
def run_g2p(target, source, env):
    with temp_file() as tfname, meta_open(source[0].rstr()) as pl_fd:
        words = set([x.split()[0].split("(")[0] for x in pl_fd])
        with meta_open(tfname, "w") as t_fd:
            t_fd.write("\n".join(words))
        out, err, success = run_command(env.subst("%s %s/bin/g2p.py --model %s --encoding=%s --apply %s --variants-mass=%f  --variants-number=%d" % (env["PYTHON"], env["OVERLAY"], source[1].rstr(), "utf-8", tfname, .9, 4)),
                                        env={"PYTHONPATH" : env.subst("${OVERLAY}/lib/python2.7/site-packages")},
                                        )
        if not success:
            return err
        else:
            with meta_open(target[0].rstr(), "w") as out_fd:
                out_fd.write(out)
    return None
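# Hedged usage sketch (file names are illustrative): SOURCES[0] is a
# pronunciation list whose first column supplies the words, SOURCES[1] a
# trained g2p model, and the target receives the hypothesized pronunciations:
#
#   env.Command(["work/oov_pronunciations.txt"],
#               ["work/oov_words.txt", "work/g2p.model"],
#               run_g2p)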
def build_index(target, source, env):
    """
    Creates an index of files listed in the input, using the IBM binary 'buildindex'.

Usage: /mnt/calculon-minor/lorelei_svn/KWS/bin64/buildindex [-opts] [lattice_list] [output_file]            
Options:                                                                 
-f file     filter fst (default : none)                                  
-p          push costs                                                   
-J int      job-batch (for parallel run)                                 
-N int      total number of jobs (for parallel run)                      
-v          (verbose) if specified all debug output is printed to stderr 
-?      help
    """
    command = env.subst("${BUILDINDEX} -p ${SOURCE} ${TARGET}", target=target, source=source)
    stdout, stderr, success = run_command(command, env={"LD_LIBRARY_PATH" : env.subst(env["LIBRARY_OVERLAY"])}, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if not success:
        return stderr
    return None
def train_pronunciation_model(target, source, env):
    """
    g2p.py --train - --devel 5% --model test.model2 --ramp-up --write-model test.model3
    """
    train_fname = source[0].rstr()
    dev_percent = source[1].read()
    if len(source) == 3:
        previous = source[2].rstr()
        cmd = "${SEQUITUR_PATH}/bin/g2p.py --train - --devel %d%% --write-model %s --ramp-up --model %s" % (dev_percent, target[0].rstr(), previous)        
    else:
        cmd = "${SEQUITUR_PATH}/bin/g2p.py --train - --devel %d%% --write-model %s" % (dev_percent, target[0].rstr())
    with open(train_fname) as ifd:
        data = "\n".join([re.sub(r"^(\S+)\(\d+\) (\S+) \[ wb \] (.*) \[ wb \]$", r"\1 \2 \3", line.strip()) for line in ifd if "REJ" not in line and line[0] != "<" and "SIL" not in line])
        out, err, success = run_command(env.subst(cmd), env={"PYTHONPATH" : env.subst("${SEQUITUR_PATH}/lib/python2.7/site-packages")}, data=data)
        if not success:
            return err
        else:
            return None
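# Hedged usage sketch (file names are illustrative): the second source is a
# Value node with the development-set percentage, and an optional third source
# names a previous model to ramp up from:
#
#   env.Command(["work/g2p.model.2"],
#               ["work/lexicon.txt", env.Value(5), "work/g2p.model.1"],
#               train_pronunciation_model)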
def score(target, source, env):
    """
    NEEDS WORK!
    CONVERT TO BUILDER!
    """
    args = source[-1].read()

    with temp_dir("kws_work") as work_dir, temp_dir("kws_out") as out_dir:
        cmd = env.subst("${PERL} ${F4DE}/bin/BABEL13_Scorer -XmllintBypass -sys ${SOURCE} -dbDir ${INDUS_DB} -comp %s -res %s -exp %s" % (work_dir, out_dir, args.get("EXPID", "KWS13_IBM_babel106b-v0.2g_conv-dev_BaDev_KWS_FullLP_BaseLR_NTAR_p-test-STO_1")), source=source)
        stdout, stderr, success = run_command(cmd, env={"LD_LIBRARY_PATH" : env.subst("${LIBRARY_OVERLAY}"), 
                                                        "F4DE_BASE" : env.subst(env["F4DE"]),
                                                        "PERL5LIB" : env.subst("$PERL_LIBRARIES"),
                                                        "PATH" : ":".join([env.subst("${OVERLAY}/bin")] + os.environ["PATH"].split(":"))})
        if not success:
            return stderr + stdout
        else:
            shutil.rmtree(os.path.dirname(target[0].rstr()), ignore_errors=False)
            shutil.copytree(out_dir, os.path.dirname(target[0].rstr()))
    return None
def standard_search(target, source, env):
    """
Usage: /mnt/calculon-minor/lorelei_svn/KWS/bin64/stdsearch [-opts] [result_file] [query_file]                     
Options:                                                                        
-d file          data file [data.list] with lines in the following format :     
                 utt_name start_time fst_path (default: data.list)              
-f filt          filter fst (default: none)                                     
-i fst           index fst (default: index.fst)                                 
-n N             return N-best results (default: return all)                    
-p fst           pad fst (default: fspad.fst)                                   
-s symbols       arc symbols (default: word.list)                               
-t threshold     min score needed to decide YES,                                
                 (if specified it overrides term-spec-threshold (default))      
-T true/false    true (default)=queries are in text format, false=fst format    
                 txtformat: query list is a list of queries,                    
                 fstformat: query list is a list of full-paths to query fsts    
-J int           job-batch (for parallel run)                                   
-N int           total number of jobs (for parallel run)                        
-a string        title on results list (default: stdbn.tlist.xml)               
-b string        prefix on termid (default: TERM-0)                             
-m string        termid numerical formatting string (default: -1524500936)             
-O               if specified, don't optimize (default : optimize = true)       
-v               (verbose) if specified, print all debug outputs to stderr      
-?               info/options
    """
    data_list, isym, idx, pad, queryph = source[0:5]
    args = source[-1].read()
    # if the compiled query list is empty, write a stub result file and stop
    if source[-2].stat().st_size == 0:
        with meta_open(target[0].rstr(), "w") as ofd:
            ofd.write("""<stdlist termlist_filename="std.xml" indexing_time="68.51" language="english" index_size="" system_id="" />\n""")
        return None
    command = env.subst("${STDSEARCH} -F ${TARGET} -i ${SOURCES[2]} -b KW%(LANGUAGE_ID)s- -s ${SOURCES[1]} -p ${SOURCES[3]} -d ${SOURCES[0]} -a %(TITLE)s -m %(PRECISION)s ${SOURCES[4]}" % args, target=target, source=source)
    stdout, stderr, success = run_command(command, env={"LD_LIBRARY_PATH" : env.subst(env["LIBRARY_OVERLAY"])}, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if not success:
        return stderr
    return None
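# Hedged usage sketch (file names and arg values are illustrative), following
# the source ordering unpacked above (data list, symbols, index, pad FST,
# query FST list) plus a final Value node with the remaining arguments:
#
#   env.Command(["work/results.xml"],
#               ["work/data.list", "work/words.sym", "work/index.fst",
#                "work/fspad.fst", "work/query_fsts.list",
#                env.Value({"LANGUAGE_ID" : "106", "TITLE" : "std.xml",
#                           "PRECISION" : "%04d"})],
#               standard_search)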
def score_results(target, source, env):
    """
    """    
    ctm_path = source[0].rstr()
    transcript = source[1].rstr()
    out_path = os.path.dirname(target[0].rstr())

    # Get a list of IDs from the reference.  All must appear in the CTM output
    spkD = set()
    with codecs.open(transcript, "rb", encoding="utf-8") as f:
        for line in f:
            if line.startswith(";;"):
                continue
            spkD.add(line.split()[0])

    # skip eval data
    isEval = re.compile("/eval/")

    # Merge and clean up CTM
    skipD = frozenset([u"~SIL", u"<s>", u"</s>", u"<HES>", u"<hes>"])
    ctmL = []
    for file_ in glob(pjoin(ctm_path, "*.ctm")):
        with codecs.open(file_, "rb", encoding="utf-8") as ctmF:
            for line in ctmF:
                uttid, pcm, beg, dur, token = line.split()
                if isEval.search(pcm):
                    continue
                token = token[:-4] # drop the trailing four-character tag
                if token in skipD:
                    continue
                idx = uttid.find("#")
                spk = uttid[:idx]
                spkD.discard(spk)
                ctmL.append((spk, float(beg), dur, token))
    ctmL.sort()

    # add in missing speakers
    for spk in spkD:
        bisect.insort(ctmL, (spk, 0.0, "0.0", "@"))

    with codecs.open(pjoin(out_path, "all.ctm"), "wb", encoding="utf-8") as outF:
        for ctm in sorted(ctmL):
            outF.write("%s 1 %7.3f %s %s\n" % ctm)

    args = {"SCLITE" : env["SCLITE_BINARY"],
            "TRANSCRIPT" : transcript,
            "TRANSCRIPT_FORMAT" : "stm",
            "HYPOTHESIS" : os.path.abspath(pjoin(out_path, "all.ctm")),
            "HYPOTHESIS_FORMAT" : "ctm",
            "ENCODING" : "utf-8",
            "OUTPUT_NAME" : "babel",
            "OUTPUT_ROOT" : os.path.abspath(out_path),
            "OUTPUT_TYPES" : "all dtl sgml",
            }

    # Run scoring
    cmd = env.subst("%(SCLITE)s -r %(TRANSCRIPT)s %(TRANSCRIPT_FORMAT)s -O %(OUTPUT_ROOT)s -h %(HYPOTHESIS)s %(HYPOTHESIS_FORMAT)s -n %(OUTPUT_NAME)s -o %(OUTPUT_TYPES)s -e %(ENCODING)s -D -F" % args)
    out, err, success = run_command(cmd)
    if not success:
        return out + err
    return None
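# Hedged usage sketch (paths are illustrative): SOURCES[0] is a directory of
# per-utterance CTM files, SOURCES[1] the reference STM transcript, and the
# sclite reports are written alongside the first target:
#
#   env.Command(["work/scoring/babel.sys"],
#               ["work/ctm", "work/reference.stm"],
#               score_results)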