def parse_trace_file(f):
    """Read a trace file and return its traces as a set of token tuples.

    Each non-blank line of the file becomes one trace: a tuple of its
    whitespace-separated tokens. If a trace begins with the start marker
    (``lib.starting_char()``) that leading token is dropped.
    """
    parsed_lines = []
    with open(f, 'r') as reader:
        for raw_line in reader:
            stripped = raw_line.strip()
            # Skip blank/whitespace-only lines; every kept tuple is non-empty.
            if stripped:
                parsed_lines.append(tuple(stripped.split()))
    traces = set()
    for tokens in parsed_lines:
        if tokens[0] == lib.starting_char():
            traces.add(tokens[1:])
        else:
            traces.add(tokens)
    return traces
def parse_sampled_traces(generated_traces_folder, prefix_name, method_list=None):
    """Parse sampled trace files into indexed feature traces.

    Scans ``generated_traces_folder`` for files whose names start with
    ``prefix_name``, splits each file into WORD-delimited trace steps, and
    converts every trace into a sequence of (feature-index, word) pairs.
    Traces longer than ``max_trace_depth()`` are diverted to validation data.

    Parameters:
        generated_traces_folder: directory searched for trace files.
        prefix_name: filename prefix identifying trace files.
        method_list: optional pre-existing method vocabulary; when None it is
            rebuilt from the methods observed in the parsed traces.

    Returns a 5-tuple:
        (sorted feature instances,
         indexed traces sorted by length,
         set of validation traces,
         method_list,
         possible_ending_words)
    """
    seed_traces_files = lib.find_files_by_prefix(generated_traces_folder, prefix_name)
    print("Found", len(seed_traces_files), "training trace files")

    trace_set = set()
    method_set = {lib.starting_char(), lib.ending_char()}
    validation_traces = []
    possible_ending_words = set()

    for f in seed_traces_files:
        print("Processing", f)
        if not is_feature_file(os.path.basename(f), prefix_name):
            continue
        print("Reading", f)
        with open(f, 'r') as reader:
            lines = [l.strip() for l in reader]

        # A trace step ends at each line starting with 'WORD'; the sentinel -1
        # lets the slice below start at line 0 for the first step.
        word_indices = [-1] + [i for i in range(len(lines)) if lines[i].startswith('WORD')]
        find_ending_methods(possible_ending_words, lines)
        print("Trace length:", len(word_indices))

        if len(word_indices) > max_trace_depth():
            # Too long for training: keep only the word sequence as validation
            # data, stripped of start markers and terminated with the end marker.
            val_trace = [e.split()[-1] for e in lines if e.startswith('WORD')]
            # Guard against a degenerate file whose words are all start markers;
            # the original code would raise IndexError on val_trace[0]/[-1].
            while val_trace and val_trace[0] == starting_char():
                val_trace = val_trace[1:]
            if not val_trace or val_trace[-1] != ending_char():
                val_trace += [ending_char()]
            validation_traces += [tuple(val_trace)]
            print("The trace is too long! Appended to validation data.")
            continue

        one_trace = []
        for i in range(1, len(word_indices)):
            # Lines between two WORD markers are the probabilities for the step;
            # the WORD line itself is the step's word.
            part = lines[word_indices[i - 1] + 1:word_indices[i] + 1]
            one_trace += [(tuple(part[:-1]), part[-1])]
        method_set |= {word_str.split()[-1] for (_, word_str) in one_trace}
        trace_set.add(tuple(one_trace))

    print("Parsed trace set:", len(trace_set))

    ###########################################################################
    # Method vocabulary: rebuild from observed methods unless one was supplied.
    if method_list is None:
        method_list = sorted(method_set)
    else:
        # BUGFIX: this message describes the case where a caller-supplied
        # (old) vocabulary is reused; it was previously printed in the
        # rebuild branch above, where it was misleading.
        print("using old method list, assuming there are no new methods in new traces")

    # NOTE(review): method2ID / actual_next_methods are currently dead — the
    # feature that consumed them (next_word_vector) is commented out below.
    # Kept so the feature can be re-enabled; confirm before deleting.
    method2ID = {e: k for (k, e) in enumerate(method_list)}
    actual_next_methods = {w: [0.0 for _ in method_list] for w in method_list}
    for one_trace in trace_set:
        for i in range(1, len(one_trace)):
            previous_word = one_trace[i - 1][-1].split()[-1]
            current_word = one_trace[i][-1].split()[-1]
            actual_next_methods[previous_word][method2ID[current_word]] = 1.0

    ###########################################################################
    # Feature construction: per step, a visited-method log-probability vector
    # concatenated with the parsed step probabilities.
    instances = set()
    returned_traces = set()
    for raw_trace in trace_set:
        feature_trace = []
        visited_method = {w: math.log10(1e-3) for w in method_list}
        for (probs, word_string) in raw_trace:
            the_word = word_string.split()[-1]
            feature_trace += [(
                tuple(visited_method[m] for m in method_list)
                + parse_probs_lines(probs, method_list)
                # + tuple(next_word_vector)
                , the_word)]
            visited_method[the_word] = math.log10(1.0 - 1e-3)
        featured = tuple(feature_trace)
        returned_traces.add(featured)
        instances |= {ps for (ps, _) in featured}

    # Replace each feature vector with a compact string index.
    instances = sorted(instances)
    instances_dict = {x: str(i) for (i, x) in enumerate(instances)}
    indexed_traces = set()
    for featured in returned_traces:
        indexed_traces.add(
            tuple((instances_dict[features], word) for (features, word) in featured))

    return (instances,
            sorted(indexed_traces, key=len),
            set(validation_traces),
            method_list,
            possible_ending_words)