Example #1
0
def run(f1_n,f2_n):
	# Parse both source files, pull out their variable-type info,
	# and return the alignment between the two.
	parsers = [codeParser(f1_n), codeParser(f2_n)]
	type_lists = [p.get_varTypes() for p in parsers]
	return aligner(type_lists[0], type_lists[1]).align()
Example #2
0
def process_files(in_dir):
	
	global WORDS, TRAIN
	
	WORDS = in_dir+'words'
	TRAIN = in_dir+'train.dat'
	
	CORRECT_LIST = []
	matches = []
	#CORRECT_LIST = (open(in_dir+'train_corr','r')).readlines()
	for line in (open(in_dir+'train_corr','r')).readlines():
		if line.replace('\n','') != '':
			CORRECT_LIST.append(line.replace('\n',''))
			matches.append(line.replace('\n',''))
	#print CORRECT_LIST
	for line in open(in_dir+'train_wrong','r').readlines():
                if line.replace('\n','') != '':
			matches.append(line.replace('\n',''))
	#print matches
	#matches = (open(in_dir+'train_corr','r')).readlines() + (open(in_dir+'train_wrong','r')).readlines()
	words_per_file = dict()
		
	words = set()
	length = str(len(matches))
	count = 0
	user_counter = 0 
	# first iteration through matches to get all words
	for match in matches:
		count +=1
		print str(count) + " of " + length + " training example preprocessing done"
		t = codeParser(match)
		#if 'tweet' in get_all_words(match) :
		#	user_counter += 1
		#	print user_counter
		words_per_file[match] = t.get_compressed()
		words = words.union(words_per_file[match])
	
	print "preliminary processing done"
	train_f = open(TRAIN, 'w')

	# second iteration through matches to get all word counts
	count = 0
	calc_freq = tfidf(words, words_per_file)
	print "Done calculating idfs"	
	#print "PARAM " + str(calc_freq.getIDF('@param'))
	for match in matches:
		count +=1
		print str(count) + " of " + length + " training examples done"
	
		if match in CORRECT_LIST:
			#print calc_freq.term_freq('tweet',words_per_file[match])
			train_f.write('+1')
			#print len(words_per_file[match])
		else:
			train_f.write('-1')
		
		train_f.write(str(get_word_counts(words_per_file[match], words, calc_freq)))
		train_f.write('\n')
	train_f.close()	
	print "Results printed to file: " + str(TRAIN)
Example #3
0
def run_varObj(f1_n):
	t = codeParser(f1_n)
	print "Testing " + f1_n + ' \n'
	counter = 1
	for var in  t.get_listOf_variableObj():
		print '\n'
		print "Variable " + str(counter) + ':'
		print 'Type: ' + var.get_type()
		print 'Name: ' + var.get_name()
		print 'Declaration: ' + var.get_declaration()
		print 'Usage: ' + str(var.get_usage())
		counter += 1
Example #4
0
	def __init__(self,input_f1, input_f2):
		"""Parse both input files and cache the variable/method data plus
		the raw source lines used later for alignment.

		Fixes vs. original: each file was parsed by three separate
		codeParser instances (redundant work) — one parser per file
		yields the same data; space-indented line normalized to tabs;
		the readlines() file handles are now closed.
		"""
		p1 = codeParser(input_f1)
		p2 = codeParser(input_f2)

		self.vars_1 = p1.get_varTypes()
		self.vars_2 = p2.get_varTypes()
		self.vars_list_1 = p1.get_listOf_variableObj()
		self.vars_list_2 = p2.get_listOf_variableObj()
		self.method_list_1 = p1.get_listOf_methodObj()
		self.method_list_2 = p2.get_listOf_methodObj()

		# Keep the raw text of both files for alignment output.
		f1 = open(input_f1, 'r')
		try:
			self.lines_1 = f1.readlines()
		finally:
			f1.close()
		f2 = open(input_f2, 'r')
		try:
			self.lines_2 = f2.readlines()
		finally:
			f2.close()
#! /usr/local/bin/python

from codeParser import codeParser
f1_n = '../../code_corpus/search_code/cat_sm'	
#f1_n = '../raw_code/test_1'
#f1_n = '../../code_corpus/regular_code/XmlResponsesSaxParser.java_30_code_mod'
t = codeParser(f1_n)

print t.get_compressed()

Example #6
0
#! /usr/local/bin/python

from codeParser import codeParser
from aligner import aligner

f1_n = '../raw_code/test_1'
f2_n = '../raw_code/test_2'

#f1_n = '../../code_corpus/regular_code/XmlResponsesSaxParser.java_30_code_mod'
t1 = codeParser(f1_n)
t2 = codeParser(f2_n)

vars1 = t1.get_varTypes()
vars2 = t2.get_varTypes()


t = aligner(vars1,vars2)
print t.align()
Example #7
0
def process_files(in_dir):
        
	words_per_file = dict()
        t_words_per_file = dict()
	'''for root, dirnames, filenames in os.walk(DIR_NAME):
                for filename in fnmatch.filter(filenames, MATCH):
                        matches.append(os.path.join(root, filename))
	'''
	global WORDS, TRAIN

        WORDS = in_dir+'words'
        TEST = in_dir+'test.dat'
	
	## SHOULD TFIDF be calc adding in test_data?
	TRAIN_SET = []
	for line in (open(in_dir+'train_corr','r')).readlines():
        	if line.replace('\n','') != '':
        	        TRAIN_SET.append(line.replace('\n',''))
	for line in (open(in_dir+'train_wrong','r')).readlines():
         	if line.replace('\n','') != '':
        	        TRAIN_SET.append(line.replace('\n',''))

	for t_match in TRAIN_SET:
                t = codeParser(t_match)
		t_words_per_file[t_match] = t.get_compressed()
	

        CORRECT_LIST = []
        matches = []
        for line in (open(in_dir+'test_corr','r')).readlines():
                if line.replace('\n','') != '':
                        CORRECT_LIST.append(line.replace('\n',''))
                        matches.append(line.replace('\n',''))
        for line in open(in_dir+'test_wrong','r').readlines():
                if line.replace('\n','') != '':
                        matches.append(line.replace('\n',''))

	words = get_all_words_file()
	length = str(len(matches))
	count = 0
	
	# first iteration through matches to get all words
	for match in matches:
		count +=1
		t = codeParser(t_match)
		print str(count) + " of " + length + " test example preprocessing done"	
		words_per_file[match] = t.get_compressed()
	print "preliminary processing done"	
	train_f = open(TEST, 'w')
	# second iteration through matches to get all word counts
	count = 0
	# TODO: should this be t_words_per_file, words_per_file or the combination???
	calc_freq = tfidf(words, t_words_per_file)
	print "Done calculating idfs"	
	#print "PARAM " + str(calc_freq.getIDF('@param'))
	for match in matches:
		count +=1
		print str(count) + " of " + length + " testing examples done"

		if match in CORRECT_LIST:
			train_f.write('+1')
		else:
			train_f.write('-1')
		
		train_f.write(str(get_word_counts(words_per_file[match], words, calc_freq)))
		train_f.write('\n')
	
	train_f.close()	
	print "Results printed to file: " + str(TEST)