def stripCommentsAndStrings(source):
	result = commentIdentify.guessTokens(source)
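	#result appears to be a 3-tuple from guessTokens: a list of (start, end)
	#block-comment token pairs, a list of line-comment tokens, and a list of
	#string-delimiter tokens (this is inferred from how result is indexed below)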

	tokens = {}
	for start, end in result[0]:
		tokens[start] = end
	# for each line, we look to see if it begins with a block-comment start token
	for line in range(len(source)):
		processedLine = source[line].split()

		if not processedLine:
			continue

		if processedLine[0] in tokens:
			endToken = tokens[processedLine[0]]
			#now we look for the matching end token, starting on this line
			for i in range(line, len(source)):
				loc = source[i].find(endToken)

				#blank the line if the end token is not found, otherwise keep it from the end token on and stop
				if loc == -1:
					source[i] = ""
				else:
					source[i] = source[i][loc:]
					break
				
	tokens = list(result[1])
	# for each line, truncate at the first line-comment token found anywhere in the line
	for line in range(len(source)):
		if not source[line].split():
			continue

		for tok in tokens:
			loc = source[line].find(tok)
			if loc != -1:
				source[line] = source[line][:loc]

	tokens = list(result[2])
	if not tokens:
		return source
	# for each line, remove string literals delimited by the guessed string tokens
	for line in range(len(source)):
		for tok in tokens:
			startLoc = source[line].find(tok)
			while startLoc != -1:
				endLoc = source[line].rfind(tok)
				#delete everything from the first occurrence of the token through the end of the last one
				source[line] = source[line][:startLoc] + source[line][endLoc+len(tok):]
				startLoc = source[line].find(tok)

	return source
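A minimal usage sketch for stripCommentsAndStrings, assuming the function lives alongside the commentIdentify module it calls and that guessTokens returns the (block-comment pairs, line-comment tokens, string tokens) triple inferred above; the input file name is hypothetical.

with open('example.c', 'r') as f:	# hypothetical input file
	source = f.readlines()

stripped = stripCommentsAndStrings(source)	# comments and strings are blanked in place
for line in stripped:
	print(line.rstrip())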
Code Example #2
def identifyCommentAndString(languages, source):
	scores = {}

	#parse source
	num_lines = float(len(source))

	result = commentIdentify.guessTokens(source)
	for lang in languages:
		#read database for each language into memory
		with open('./database/'+lang+'/lineComments.txt', 'r') as databasefile:
			lineCommentlines = databasefile.readlines()
		with open('./database/'+lang+'/blockComments.txt', 'r') as databasefile:
			blockCommentlines = databasefile.readlines()
		with open('./database/'+lang+'/strings.txt', 'r') as databasefile:
			stringlines = databasefile.readlines()
		
		database_linecomments = []
		
		#create databases
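		#assumed database line format (inferred from the indexing below): the last
		#field on each line is an integer count, the first field is the token, and
		#block-comment lines carry a start token and an end token before the count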
		for line in lineCommentlines:
			database_linecomments.append([])
			database_linecomments[-1].append(int(line.strip().split(" ")[-1]))
			database_linecomments[-1].append(line.split(" ")[0])
		database_linecomments.sort()

		database_blockcomments = []
		
		
		for line in blockCommentlines:
			database_blockcomments.append([])
			database_blockcomments[-1].append(int(line.strip().split(" ")[-1]))
			database_blockcomments[-1].append([line.split(" ")[0], line.split(" ")[1]])
		database_blockcomments.sort()

		database_strings = []
		
		
		for line in stringlines:
			database_strings.append([])
			database_strings[-1].append(int(line.strip().split(" ")[-1]))
			database_strings[-1].append(line.split(" ")[0])
		database_strings.sort()

		#lang_score gets 1 'point' every time you find a match in terms of comment and string tokens
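		#the databases are sorted ascending by count, so negative indices walk the
		#highest-count tokens first; commentsNum is assumed to be a module-level
		#constant in the surrounding project that caps how many tokens are compared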
		i = -1
		lang_score = 0
		while i > -1 - commentsNum and i*-1 <= len(database_blockcomments):
			for start, end in result[0]:
				if database_blockcomments[i][1][0] == start and database_blockcomments[i][1][1] == end:
					lang_score += 1
			i -= 1
		j = i
		i = -1
		while i > -1 - commentsNum and i*-1 <= len(database_linecomments):
			for tok in result[1]:
				if database_linecomments[i][1] == tok:
					lang_score += 1
			i -= 1
		j += i
		i = -1
		while i > -1 - commentsNum and i*-1 <= len(database_strings):
			for tok in result[2]:
				if database_strings[i][1] == tok:
					lang_score += 1
			i -= 1
		j += i

		lang_score *= j/((commentsNum+1)*3)
		if lang_score == 0:
			lang_score = 1e-19
		scores[lang] = lang_score
		

	summed_scores = 0
	for lang in languages:
		summed_scores += scores[lang]

	for lang in languages:
		try:
			scores[lang] /= summed_scores
		except ZeroDivisionError:
			scores[lang] = 0

	return scores
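A minimal usage sketch for identifyCommentAndString, assuming the ./database/<lang>/ text files read above exist and that commentsNum is defined at module level in the surrounding project; the language list and input file name are hypothetical.

languages = ['c', 'java', 'python']	# hypothetical entries under ./database/
with open('mystery_source.txt', 'r') as f:	# hypothetical input file
	source = f.readlines()

scores = identifyCommentAndString(languages, source)
# each language's score is normalized by the sum over all languages
for lang in sorted(scores, key=scores.get, reverse=True):
	print("%s %.4f" % (lang, scores[lang]))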