def search_queries(I, queries, lines, output_file):
    """Search every query of a keyword list in an indexed transcription.

    Each ``kw`` element of the query file is split into lowercase words.
    When the first word is a key of ``I``, a detected entry is opened for
    the query through ``kw_detected``; every position listed for that word
    is then checked with ``match_query`` and ``valid_time_gap``, and each
    position passing both checks is recorded with ``append_query_result``.

    Args:
        I: mapping from a word to the indexes in ``lines`` where it occurs.
        queries: path or file object accepted by ``ET.parse``; its root
            holds the ``kw`` elements (``kwid`` attribute, ``kwtext`` child).
        lines: sequence of whitespace-separated records.  Fields 0, 1 and 2
            are copied as file, channel and start time; field 3 is read as
            a duration and field 5 as a score.
        output_file: not used by this function; kept so existing callers
            keep working.

    Returns:
        An ``ET.ElementTree`` whose root is the ``kwslist`` element.
    """
    # ``reduce`` is a builtin on Python 2 only; this import works on both.
    from functools import reduce

    # initialize the output doc creating the root
    attrs = OrderedDict()
    attrs['kwlist_filename'] = 'IARPA-babel202b-v1.0d_conv-dev.kwlist.xml'
    attrs['language'] = 'swahili'
    attrs['system_id'] = ''
    root = ET.Element('kwslist', attrs)

    # one ``kw`` element per query
    for kw in ET.parse(queries).getroot().findall('kw'):
        kwid = kw.get('kwid')
        # Strip before splitting: padding around the text would otherwise
        # produce an empty first token and the query would never be searched.
        # A missing text (None) is treated as an empty query.
        kwtext = kw.find('kwtext').text or ''
        q = [word.lower() for word in re.split(r'\s+', kwtext.strip())]

        # only search the whole query when its first word was transcribed
        if q[0] not in I:
            continue

        root, detected_kwsl = kw_detected(root, kwid)
        qlen = len(q)

        # check all occurrences of the first word of the query
        for i in I[q[0]]:
            # the block starting at i must spell the query and respect the
            # time-gap constraint
            if not (match_query(lines, i, qlen, q)
                    and valid_time_gap(lines, i, qlen)):
                continue

            # split each record of the matched block once
            fields = [re.split(r'\s+', lines[x]) for x in range(i, i + qlen)]
            firstinfo = fields[0]
            durtot = sum(float(f[3]) for f in fields)
            # the hit score is the product of the word scores
            finalscore = reduce(operator.mul, [float(f[5]) for f in fields], 1)

            info = OrderedDict()
            info['file'] = firstinfo[0]
            info['channel'] = firstinfo[1]
            info['tbeg'] = firstinfo[2]
            info['dur'] = str(round(durtot, 2))
            info['score'] = str(finalscore)
            info['decision'] = 'YES'
            root, detected_kwsl = append_query_result(
                root, detected_kwsl, info)

    return ET.ElementTree(root)
# Sum-to-one score normalisation: every hit score of a query is raised to the
# power gamma and divided by the sum of the powered scores of that query.
if len(sys.argv) < 3:
    raise RuntimeError('Run script as:\n\tpython scoreNormalization.py path_to/input_file.xml path_to/output_file.xml [gamma]')

# gamma can be tuned (optional third argument, defaults to 1)
gamma = 1
input_file = sys.argv[1]
output_file = sys.argv[2]
if len(sys.argv) > 3:
    gamma = float(sys.argv[3])

# sum over all hits of a query:
# open input file with original scores
doc = ET.parse(input_file)
detected_kwlists = doc.getroot().findall('detected_kwlist')

# for each query detected in the file
for dkw in detected_kwlists:
    # get all the hits and the sum of all their (powered) scores
    kws = dkw.findall('kw')
    # NOTE(review): sum_scores is 0 when every hit of the query scores 0,
    # which makes the division below raise ZeroDivisionError.
    sum_scores = sum(pow(float(kw.get('score')), gamma) for kw in kws)

    # for each hit update the score by dividing by the sum of the scores
    for kw in kws:
        # The normalised score must be computed before it is stored;
        # assigning att['score'] first used an undefined or stale value.
        new_score = str(pow(float(kw.attrib['score']), gamma) / sum_scores)

        # NOTE(review): `att` is not created in the visible code; it is
        # presumably defined earlier in the file. Confirm.
        att['file'] = kw.attrib['file']
        att['channel'] = kw.attrib['channel']
        att['tbeg'] = kw.attrib['tbeg']
        att['dur'] = kw.attrib['dur']
        att['score'] = new_score
        att['decision'] = kw.attrib['decision']
# NOTE(review): this chunk opens inside a helper whose header is outside the
# visible source (the recursive call suggests `indent(elem, level)`).  The
# nesting of the next six lines is reconstructed from flattened text; confirm
# it against the full definition.
            indent(elem, level + 1)
        if not elem.tail or not elem.tail.strip():
            elem.tail = i
    else:
        if level and (not elem.tail or not elem.tail.strip()):
            elem.tail = i

''' ----------------- MAIN ----------------- '''
# Command-line arguments: two input files parsed as XML below, and an output
# path (not used in the visible part of the script).
file1 = sys.argv[1]
file2 = sys.argv[2]
output_file = sys.argv[3]

# load file for system1
doc1 = ET.parse(file1)
detected_kwl_1 = doc1.getroot()

# load file for system2
tree2 = ET.parse(file2)
detected_kwl_2 = tree2.getroot()

# get all the queries in system2 and their kwid attributes
queries_2 = detected_kwl_2.findall('detected_kwlist')
kwids = [kw.get('kwid') for kw in queries_2]

# for each query in system1 find the one with same kwid in system2
for query_1 in detected_kwl_1:
    kwid = query_1.get("kwid")
    # reset for each query; how it is filled lies past the end of this view
    query_2 = []
# Command-line arguments: the query file (parsed as XML below), the reference
# transcription, and an output path (not used in the visible part).
input_queries = sys.argv[1]
ref = sys.argv[2]
output_queries = sys.argv[3]
#print input_queries, ref, output_queries

# generate iv dictionary from the transcription ref
# (only membership tests `w not in IV` are made on it in the visible code)
IV = iv_dict(ref)

# load the graphemic mapping and build the grapheme-confusion matrix CM
grph_map = 'lib/kws/grapheme.map'
with open(grph_map, 'r') as f:
    lines_map = f.readlines()
CM = generate_CM(lines_map)

# collect every `kw` element (one per query) from the query file
doc = ET.parse(input_queries)
kws = doc.getroot().findall('kw')

# keep track of an OOV dictionary of the oov words you already encountered
# it will contain, for all the oov words, the closest iv word and the distance
OOV = {}

# for each query in the file
for kw in kws:
    # NOTE(review): text padded with whitespace yields empty-string tokens
    # here, which would then be handled as OOV words.
    kwtext = re.split('\s+', kw.find('kwtext').text)
    # NOTE(review): the index i is presumably needed further down (past the
    # end of this view). Confirm before switching to direct iteration.
    for i in range(len(kwtext)):
        w = kwtext[i]
        # check only the oov
        if w not in IV:
            # if w is already seen in this run we already have the info
            if w in OOV:
import re
import sys

import myetree.ElementTree as ET

# Writes one line per query to `outmap_file`:
#     <number of words> <query id> <running count of queries of that length>
# and finally prints how many queries were seen for each length.
query_file = 'lib/kws/queries.xml'
outmap_file = 'querylength.map'

doc = ET.parse(query_file)
kws = doc.getroot().findall('kw')

# build a dictionary to store the number (cont) of queries of length n:
# counter[n] = cont
counter = {}
with open(outmap_file, 'w') as f:
    # for each query in the file
    for kw in kws:
        # get the query id: the part after the last '-' (KW202-id)
        idx = kw.get('kwid').split('-')[-1]
        # load the list of words; strip first so that whitespace padding
        # around the text is not counted as extra (empty) words
        query = [x.lower()
                 for x in re.split(r'\s+', kw.find('kwtext').text.strip())]
        # evaluate the number of words
        n = len(query)
        counter[n] = counter.get(n, 0) + 1
        line = ' '.join([str(n), str(idx), str(counter[n])])
        f.write(line + '\n')

# Same text as the Python 2 statement `print 'counter', counter`, written so
# that it prints identically on Python 2 and 3.
print(' '.join(['counter', str(counter)]))
# `error` is set before the start of this view (presumably by the
# command-line checks. Confirm).
if error:
    raise RuntimeError,'Run script as:\n\tpython morpoDecomposition.py path_to/input_file.{ctm, xml} path_to/dict_file.dct path_to/output_file.{ctm, xml}'

# read morphological file and build dictionary
# NOTE(review): the handle is not closed if readlines() raises; a `with`
# block would cover that.
d = open(dct)
dct_lines = d.readlines()
d.close()
D = make_dict(dct_lines)

# the input type is chosen from the text after the last '.' of the file name
extension = input_f.split('.')[-1]
if extension=='xml':
    # ---- file xml: it's the query.xml
    # open query file and collect every `kw` element (one per query)
    doc = ET.parse(input_f)
    kws = doc.getroot().findall('kw')
    # for each query in the file
    for kw in kws:
        kwtext = re.split('\s+', kw.find('kwtext').text)
        text = ''
        # split each word
        for w in kwtext:
            # get decomposition for w from morphological dictionary D
            # NOTE(review): raises KeyError if w is not a key of D.
            decomposition = D[w]
            # update text of the tree node: pieces joined by single spaces
            for s in decomposition:
                text += s+' '
        # remove the possible trailing space
        # NOTE(review): text[-1] raises IndexError if text is still empty.
        # The nesting level of this check is reconstructed from flattened
        # text and its body lies past the end of this view.
        if text[-1]==' ':