def stripCommentsAndStrings(source):
    """Strip block comments, line comments and string literals from *source*.

    *source* is a list of lines; it is modified in place and also returned.
    Token guesses come from ``commentIdentify.guessTokens(source)``, which
    returns a 3-tuple: block-comment ``(start, end)`` pairs, line-comment
    tokens, and string-delimiter tokens.

    NOTE(review): this is heuristic — tokens occurring inside strings or
    comments are not distinguished from real delimiters.
    """
    result = commentIdentify.guessTokens(source)

    # ---- block comments: map each start token to its end token ----------
    tokens = {}
    for start, end in result[0]:
        tokens[start] = end
    # For each line, see if it begins with a block-comment start token.
    for line in range(len(source)):
        processedLine = source[line].strip().split(" ")
        if processedLine == []:
            continue
        # dict.has_key() was removed in Python 3 — use the ``in`` operator.
        if processedLine[0] in tokens:
            endToken = tokens[processedLine[0]]
            # Scan forward (starting with this same line) for the end token,
            # blanking every line until it is found.
            i = line - 1
            while i < len(source):
                i += 1
                loc = source[i].find(endToken)
                if loc == -1:
                    source[i] = ""
                    continue
                # Keep the text from the end token onward.
                # NOTE(review): this retains the end token itself — confirm
                # whether ``loc + len(endToken)`` was intended instead.
                source[i] = source[i][loc:]
                break

    # ---- line comments ---------------------------------------------------
    tokens = list(result[1])
    for line in range(len(source)):
        processedLine = source[line].strip().split(" ")
        if processedLine == []:
            continue
        # Truncate the line at any line-comment token found on it.
        for tok in tokens:
            if source[line].find(tok) != -1:
                source[line] = source[line][:source[line].find(tok)]

    # ---- string literals -------------------------------------------------
    tokens = list(result[2])
    for line in range(len(source)):
        if tokens == []:
            break
        for tok in tokens:
            startLoc = source[line].find(tok)
            while startLoc != -1:
                # Delete everything between the first and last occurrence of
                # the delimiter on this line (inclusive).
                # NOTE(review): ``endLoc + 1`` assumes a one-character
                # delimiter — confirm behaviour for multi-character tokens.
                endLoc = source[line].rfind(tok)
                source[line] = source[line][:startLoc] + source[line][endLoc + 1:]
                startLoc = source[line].find(tok)
    return source
def identifyCommentAndString(languages, source):
    """Score each language by how well its known comment/string tokens match
    the tokens guessed from *source*.

    For every language in *languages*, the on-disk token databases
    (``./database/<lang>/{lineComments,blockComments,strings}.txt``) are
    compared against the guesses from ``commentIdentify.guessTokens``; each
    match earns one point.  Scores are normalised to sum to 1 and returned
    as a dict mapping language name -> score.

    NOTE(review): ``commentsNum`` is a module-level name defined elsewhere
    in this file — only the ``commentsNum`` most frequent database rows are
    considered.
    """
    scores = {}
    result = commentIdentify.guessTokens(source)
    for lang in languages:
        # Read the per-language token databases into memory; ``with``
        # guarantees the file handles are closed even on error.
        with open('./database/' + lang + '/lineComments.txt', 'r') as databasefile:
            lineCommentlines = databasefile.readlines()
        with open('./database/' + lang + '/blockComments.txt', 'r') as databasefile:
            blockCommentlines = databasefile.readlines()
        with open('./database/' + lang + '/strings.txt', 'r') as databasefile:
            stringlines = databasefile.readlines()
        # Each database row is "<token(s)> ... <count>"; store [count, token]
        # so that sorting orders rows by frequency.
        database_linecomments = []
        for line in lineCommentlines:
            database_linecomments.append(
                [int(line.strip().split(" ")[-1]), line.split(" ")[0]])
        database_linecomments.sort()
        database_blockcomments = []
        for line in blockCommentlines:
            database_blockcomments.append(
                [int(line.strip().split(" ")[-1]),
                 [line.split(" ")[0], line.split(" ")[1]]])
        database_blockcomments.sort()
        database_strings = []
        for line in stringlines:
            database_strings.append(
                [int(line.strip().split(" ")[-1]), line.split(" ")[0]])
        database_strings.sort()
        # lang_score gets one point per matching comment/string token,
        # walking the most frequent rows from the end of each sorted list.
        i = -1
        lang_score = 0
        while i > -1 - commentsNum and i * -1 <= len(database_blockcomments):
            for start, end in result[0]:
                # BUGFIX: the end token must be compared against [1][1];
                # the original compared the start token ([1][0]) twice.
                if (database_blockcomments[i][1][0] == start
                        and database_blockcomments[i][1][1] == end):
                    lang_score += 1
            i -= 1
        j = i
        i = -1
        while i > -1 - commentsNum and i * -1 <= len(database_linecomments):
            for tok in result[1]:
                if database_linecomments[i][1] == tok:
                    lang_score += 1
            i -= 1
        j += i
        i = -1
        while i > -1 - commentsNum and i * -1 <= len(database_strings):
            for tok in result[2]:
                if database_strings[i][1] == tok:
                    lang_score += 1
            i -= 1
        j += i
        # NOTE(review): ``j`` is negative at this point, and under Python 2
        # ``/`` this is floor division — confirm the intended weighting
        # before porting to Python 3 (PEP 238 changes the result).
        lang_score *= j / ((commentsNum + 1) * 3)
        # Avoid a hard zero so the normalisation below never divides a
        # language's score to exactly nothing.
        if lang_score == 0:
            lang_score = 0.0000000000000000001
        scores[lang] = lang_score
    # Normalise the scores so they sum to 1.
    summed_scores = 0
    for lang in languages:
        summed_scores += scores[lang]
    for lang in languages:
        try:
            scores[lang] /= summed_scores
        except ZeroDivisionError:
            scores[lang] = 0
    return scores
def identifyCommentAndString(languages, source):
    """Score each language by how well its known comment/string tokens match
    the tokens guessed from *source*.

    For every language in *languages*, the on-disk token databases
    (``./database/<lang>/{lineComments,blockComments,strings}.txt``) are
    compared against the guesses from ``commentIdentify.guessTokens``; each
    match earns one point.  Scores are normalised to sum to 1 and returned
    as a dict mapping language name -> score.

    NOTE(review): SOURCE contains two identical definitions of this
    function; at import time the later one shadows the earlier one.  One of
    the duplicates should be removed.
    NOTE(review): ``commentsNum`` is a module-level name defined elsewhere
    in this file.
    """
    scores = {}
    result = commentIdentify.guessTokens(source)
    for lang in languages:
        # Read the per-language token databases into memory; ``with``
        # guarantees the file handles are closed even on error.
        with open('./database/' + lang + '/lineComments.txt', 'r') as databasefile:
            lineCommentlines = databasefile.readlines()
        with open('./database/' + lang + '/blockComments.txt', 'r') as databasefile:
            blockCommentlines = databasefile.readlines()
        with open('./database/' + lang + '/strings.txt', 'r') as databasefile:
            stringlines = databasefile.readlines()
        # Each database row is "<token(s)> ... <count>"; store [count, token]
        # so that sorting orders rows by frequency.
        database_linecomments = []
        for line in lineCommentlines:
            database_linecomments.append(
                [int(line.strip().split(" ")[-1]), line.split(" ")[0]])
        database_linecomments.sort()
        database_blockcomments = []
        for line in blockCommentlines:
            database_blockcomments.append(
                [int(line.strip().split(" ")[-1]),
                 [line.split(" ")[0], line.split(" ")[1]]])
        database_blockcomments.sort()
        database_strings = []
        for line in stringlines:
            database_strings.append(
                [int(line.strip().split(" ")[-1]), line.split(" ")[0]])
        database_strings.sort()
        # lang_score gets one point per matching comment/string token,
        # walking the most frequent rows from the end of each sorted list.
        i = -1
        lang_score = 0
        while i > -1 - commentsNum and i * -1 <= len(database_blockcomments):
            for start, end in result[0]:
                # BUGFIX: the end token must be compared against [1][1];
                # the original compared the start token ([1][0]) twice.
                if (database_blockcomments[i][1][0] == start
                        and database_blockcomments[i][1][1] == end):
                    lang_score += 1
            i -= 1
        j = i
        i = -1
        while i > -1 - commentsNum and i * -1 <= len(database_linecomments):
            for tok in result[1]:
                if database_linecomments[i][1] == tok:
                    lang_score += 1
            i -= 1
        j += i
        i = -1
        while i > -1 - commentsNum and i * -1 <= len(database_strings):
            for tok in result[2]:
                if database_strings[i][1] == tok:
                    lang_score += 1
            i -= 1
        j += i
        # NOTE(review): ``j`` is negative at this point, and under Python 2
        # ``/`` this is floor division — confirm the intended weighting
        # before porting to Python 3 (PEP 238 changes the result).
        lang_score *= j / ((commentsNum + 1) * 3)
        # Avoid a hard zero so the normalisation below never divides a
        # language's score to exactly nothing.
        if lang_score == 0:
            lang_score = 0.0000000000000000001
        scores[lang] = lang_score
    # Normalise the scores so they sum to 1.
    summed_scores = 0
    for lang in languages:
        summed_scores += scores[lang]
    for lang in languages:
        try:
            scores[lang] /= summed_scores
        except ZeroDivisionError:
            scores[lang] = 0
    return scores