Beispiel #1
0
def parse_trace_file(f):
    """Read a trace file and return its distinct traces as a set of tuples.

    Each non-blank line is split on whitespace into a tuple of tokens. When
    the first token of a line equals ``lib.starting_char()``, that token is
    dropped. Duplicate traces collapse because the result is a set.

    Args:
        f: Path of the trace file to read.

    Returns:
        A set of token tuples, one per distinct non-blank line.
    """
    with open(f, 'r') as reader:
        tokenised = [
            tuple(stripped.split())
            for stripped in (raw.strip() for raw in reader)
            if stripped
        ]

    unique_traces = set()
    for tokens in tokenised:
        # Strip the leading start marker so traces compare equal with or
        # without it.
        if tokens[0] == lib.starting_char():
            tokens = tokens[1:]
        unique_traces.add(tokens)
    return unique_traces
Beispiel #2
0
def parse_sampled_traces(generated_traces_folder,
                         prefix_name,
                         method_list=None):
    """Parse sampled trace files into indexed feature traces.

    Every file returned by ``lib.find_files_by_prefix`` that passes
    ``is_feature_file`` is read line by line. Lines starting with ``WORD``
    delimit the steps of a trace: the lines between two consecutive ``WORD``
    lines are that step's probability lines, and the last whitespace-separated
    token of the ``WORD`` line is the step's method name. Files with more
    steps than ``max_trace_depth()`` allows are not featurised; their method
    sequence is collected as a validation trace instead.

    Args:
        generated_traces_folder: Folder handed to ``lib.find_files_by_prefix``.
        prefix_name: File-name prefix handed to ``lib.find_files_by_prefix``
            and ``is_feature_file``.
        method_list: Optional ordered list of method names that fixes the
            feature layout. When ``None`` it is built as the sorted set of
            methods seen in the parsed traces plus the start/end markers.

    Returns:
        A 5-tuple ``(instances, indexed_traces, validation_traces,
        method_list, possible_ending_words)`` where

        * ``instances`` is the sorted list of distinct feature tuples,
        * ``indexed_traces`` is a list of traces, shortest first, each a tuple
          of ``(instance_index_as_str, method_name)`` pairs,
        * ``validation_traces`` is the set of over-long traces (tuples of
          method names),
        * ``method_list`` is the method list that was used,
        * ``possible_ending_words`` is the set passed to
          ``find_ending_methods`` for each file (presumably filled in there).

    Raises:
        KeyError: If a trace contains a method missing from ``method_list``.
    """
    seed_traces_files = lib.find_files_by_prefix(generated_traces_folder,
                                                 prefix_name)
    print("Found", len(seed_traces_files), "training trace files")
    trace_set = set()
    method_set = {lib.starting_char(), lib.ending_char()}
    validation_traces = []
    possible_ending_words = set()
    for f in seed_traces_files:
        print("Processing", f)
        if not is_feature_file(os.path.basename(f), prefix_name):
            continue
        print("Reading", f)
        with open(f, 'r') as reader:
            lines = [l.strip() for l in reader]
        # The leading -1 sentinel makes the first step start at line 0.
        word_indices = [-1] + [
            i for i, line in enumerate(lines) if line.startswith('WORD')
        ]
        find_ending_methods(possible_ending_words, lines)

        print("Trace length:", len(word_indices))
        if len(word_indices) > max_trace_depth():
            val_trace = [lines[i].split()[-1] for i in word_indices[1:]]
            # Normalise: no leading start markers, exactly one end marker.
            while val_trace[0] == starting_char():
                val_trace = val_trace[1:]
            if val_trace[-1] != ending_char():
                val_trace.append(ending_char())
            validation_traces.append(tuple(val_trace))
            print("The trace is too long! Appended to validation data.")
            continue

        # Each step is (probability lines before the WORD line, WORD line).
        one_trace = [(tuple(lines[start + 1:end]), lines[end])
                     for start, end in zip(word_indices, word_indices[1:])]
        # Collect the methods once per file instead of rescanning the whole
        # partial trace after every appended step.
        method_set.update(word_str.split()[-1] for _, word_str in one_trace)
        trace_set.add(tuple(one_trace))
    print("Parsed trace set:", len(trace_set))

    # Create legal pairs.
    if method_list is None:
        method_list = sorted(method_set)
        # NOTE(review): this message is printed on the branch that builds a
        # fresh list from the traces, which looks inverted -- confirm intent.
        print(
            "using old method list, assuming there are no new methods in new traces"
        )

    method2ID = {e: k for (k, e) in enumerate(method_list)}
    # NOTE(review): this successor table is neither returned nor read below.
    # It is kept because indexing it raises KeyError for methods that are
    # missing from a caller-supplied method_list.
    actual_next_methods = {w: [0.0 for _ in method_list] for w in method_list}
    for one_trace in trace_set:
        for previous_step, current_step in zip(one_trace, one_trace[1:]):
            previous_word = previous_step[-1].split()[-1]
            current_word = current_step[-1].split()[-1]
            actual_next_methods[previous_word][method2ID[current_word]] = 1.0

    # Log-scores marking whether a method has already occurred in the trace.
    log_unvisited = math.log10(1e-3)
    log_visited = math.log10(1.0 - 1e-3)

    instances = set()
    returned_traces = set()
    for one_trace in trace_set:
        feature_trace = []
        visited_method = {w: log_unvisited for w in method_list}
        for probs, word_string in one_trace:
            the_word = word_string.split()[-1]
            # Features are built before the current word is marked visited.
            features = (tuple(visited_method[m] for m in method_list) +
                        parse_probs_lines(probs, method_list))
            feature_trace.append((features, the_word))
            visited_method[the_word] = log_visited
        returned_traces.add(tuple(feature_trace))
        instances.update(features for features, _ in feature_trace)

    instances = sorted(instances)
    # Replace each feature tuple by its position in `instances`, as a string.
    instances_dict = {x: str(i) for (i, x) in enumerate(instances)}
    indexed_traces = {
        tuple((instances_dict[features], word) for features, word in one_trace)
        for one_trace in returned_traces
    }

    return instances, sorted(indexed_traces, key=len), set(
        validation_traces), method_list, possible_ending_words