def run(f1_n, f2_n):
    """Parse the two input files and return the alignment of their variable types."""
    parser_a = codeParser(f1_n)
    parser_b = codeParser(f2_n)
    types_a = parser_a.get_varTypes()
    types_b = parser_b.get_varTypes()
    pair_aligner = aligner(types_a, types_b)
    return pair_aligner.align()
def process_files(in_dir): global WORDS, TRAIN WORDS = in_dir+'words' TRAIN = in_dir+'train.dat' CORRECT_LIST = [] matches = [] #CORRECT_LIST = (open(in_dir+'train_corr','r')).readlines() for line in (open(in_dir+'train_corr','r')).readlines(): if line.replace('\n','') != '': CORRECT_LIST.append(line.replace('\n','')) matches.append(line.replace('\n','')) #print CORRECT_LIST for line in open(in_dir+'train_wrong','r').readlines(): if line.replace('\n','') != '': matches.append(line.replace('\n','')) #print matches #matches = (open(in_dir+'train_corr','r')).readlines() + (open(in_dir+'train_wrong','r')).readlines() words_per_file = dict() words = set() length = str(len(matches)) count = 0 user_counter = 0 # first iteration through matches to get all words for match in matches: count +=1 print str(count) + " of " + length + " training example preprocessing done" t = codeParser(match) #if 'tweet' in get_all_words(match) : # user_counter += 1 # print user_counter words_per_file[match] = t.get_compressed() words = words.union(words_per_file[match]) print "preliminary processing done" train_f = open(TRAIN, 'w') # second iteration through matches to get all word counts count = 0 calc_freq = tfidf(words, words_per_file) print "Done calculating idfs" #print "PARAM " + str(calc_freq.getIDF('@param')) for match in matches: count +=1 print str(count) + " of " + length + " training examples done" if match in CORRECT_LIST: #print calc_freq.term_freq('tweet',words_per_file[match]) train_f.write('+1') #print len(words_per_file[match]) else: train_f.write('-1') train_f.write(str(get_word_counts(words_per_file[match], words, calc_freq))) train_f.write('\n') train_f.close() print "Results printed to file: " + str(TRAIN)
def run_varObj(f1_n): t = codeParser(f1_n) print "Testing " + f1_n + ' \n' counter = 1 for var in t.get_listOf_variableObj(): print '\n' print "Variable " + str(counter) + ':' print 'Type: ' + var.get_type() print 'Name: ' + var.get_name() print 'Declaration: ' + var.get_declaration() print 'Usage: ' + str(var.get_usage()) counter += 1
def __init__(self, input_f1, input_f2):
    """Load and parse both input files, keeping types, variable objects,
    method objects, and the raw source lines of each file.

    FIX: the original built a fresh codeParser for every attribute (six
    parses in total) and never closed the two file handles. Each file is
    now parsed once and the parser reused.
    NOTE(review): assumes the codeParser getters are side-effect free so
    reusing one instance yields the same values — consistent with how the
    getters are called elsewhere in this file; confirm against codeParser.
    """
    parser_1 = codeParser(input_f1)
    parser_2 = codeParser(input_f2)
    self.vars_1 = parser_1.get_varTypes()
    self.vars_2 = parser_2.get_varTypes()
    self.vars_list_1 = parser_1.get_listOf_variableObj()
    self.vars_list_2 = parser_2.get_listOf_variableObj()
    self.method_list_1 = parser_1.get_listOf_methodObj()
    self.method_list_2 = parser_2.get_listOf_methodObj()
    # Raw source lines, with the handles closed deterministically.
    with open(input_f1, 'r') as f1:
        self.lines_1 = f1.readlines()
    with open(input_f2, 'r') as f2:
        self.lines_2 = f2.readlines()
#! /usr/local/bin/python from codeParser import codeParser f1_n = '../../code_corpus/search_code/cat_sm' #f1_n = '../raw_code/test_1' #f1_n = '../../code_corpus/regular_code/XmlResponsesSaxParser.java_30_code_mod' t = codeParser(f1_n) print t.get_compressed()
#! /usr/local/bin/python from codeParser import codeParser from aligner import aligner f1_n = '../raw_code/test_1' f2_n = '../raw_code/test_2' #f1_n = '../../code_corpus/regular_code/XmlResponsesSaxParser.java_30_code_mod' t1 = codeParser(f1_n) t2 = codeParser(f2_n) vars1 = t1.get_varTypes() vars2 = t2.get_varTypes() t = aligner(vars1,vars2) print t.align()
def process_files(in_dir): words_per_file = dict() t_words_per_file = dict() '''for root, dirnames, filenames in os.walk(DIR_NAME): for filename in fnmatch.filter(filenames, MATCH): matches.append(os.path.join(root, filename)) ''' global WORDS, TRAIN WORDS = in_dir+'words' TEST = in_dir+'test.dat' ## SHOULD TFIDF be calc adding in test_data? TRAIN_SET = [] for line in (open(in_dir+'train_corr','r')).readlines(): if line.replace('\n','') != '': TRAIN_SET.append(line.replace('\n','')) for line in (open(in_dir+'train_wrong','r')).readlines(): if line.replace('\n','') != '': TRAIN_SET.append(line.replace('\n','')) for t_match in TRAIN_SET: t = codeParser(t_match) t_words_per_file[t_match] = t.get_compressed() CORRECT_LIST = [] matches = [] for line in (open(in_dir+'test_corr','r')).readlines(): if line.replace('\n','') != '': CORRECT_LIST.append(line.replace('\n','')) matches.append(line.replace('\n','')) for line in open(in_dir+'test_wrong','r').readlines(): if line.replace('\n','') != '': matches.append(line.replace('\n','')) words = get_all_words_file() length = str(len(matches)) count = 0 # first iteration through matches to get all words for match in matches: count +=1 t = codeParser(t_match) print str(count) + " of " + length + " test example preprocessing done" words_per_file[match] = t.get_compressed() print "preliminary processing done" train_f = open(TEST, 'w') # second iteration through matches to get all word counts count = 0 # TODO: should this be t_words_per_file, words_per_file or the combination??? calc_freq = tfidf(words, t_words_per_file) print "Done calculating idfs" #print "PARAM " + str(calc_freq.getIDF('@param')) for match in matches: count +=1 print str(count) + " of " + length + " testing examples done" if match in CORRECT_LIST: train_f.write('+1') else: train_f.write('-1') train_f.write(str(get_word_counts(words_per_file[match], words, calc_freq))) train_f.write('\n') train_f.close() print "Results printed to file: " + str(TEST)