def select_code_data(): conn = mysql.connector.connect(user='******', password='******', database='codetag') # , use_unicode=True cursor = conn.cursor(buffered=True) cursorInsert = conn.cursor() cursor.execute('select * from SampleC') numRows = int(cursor.rowcount) print numRows data_dict = dict() for i in range(numRows): row = cursor.fetchone() id = row[0] code = cc.code_clean(cc.remove_cpp_comment(row[1].encode('utf-8'))) tags = str(row[2]).split('#') finalTag = '' for tag in tags: if (tag in tagSet): if (tag in data_dict): data_dict[tag] += 1 else: data_dict[tag] = 1 finalTag = tag if (finalTag != ''): sql = "insert into selectTag(Id,Code,Tags) values(%s,%s,%s)" data = [id, code, finalTag] # print data cursorInsert.execute(sql, data) print str(i) + "----" + str(numRows) for k, v in data_dict.iteritems(): print k, v conn.commit() conn.close()
def prepare_csv(): conn = mysql.connector.connect(user='******', password='******', database='codetag') # , use_unicode=True cursor = conn.cursor(buffered=True) tagDict = create_dict() code_train = codecs.open(path + 'train.txt', 'w+', 'utf8') code_dev = codecs.open(path + 'dev.txt', 'w+', 'utf8') code_test = codecs.open(path + 'test.txt', 'w+', 'utf8') for t in tagSet: #cursor.execute('select * from selectTagType where ') cursor.execute( 'select * from selectTagType where Tags = %s ORDER BY Id', (t, )) numRows = int(cursor.rowcount) print str(numRows) + '---------' + t for i in range(numRows): row = cursor.fetchone() id = row[0] code = cc.code_anonymous( cc.get_normalize_code( cc.remove_non_ascii_1(row[1].encode('utf-8')).replace( "\n", " "), 1000)).replace('\x00', '') #code = cc.get_normalize_code(cc.remove_non_ascii_1(row[1].encode('utf-8')).replace("\n", " "), 1000).replace("@","").replace('\x00', '') patternBlank = re.compile(' +') code = re.sub(patternBlank, " ", code).replace("@", "") type = cc.remove_dupliacte( cc.string_reverse(str(row[2]).replace('\n', ''))) tag = str(tagDict.get(str(row[3]).replace('\n', ''))) if (i < numRows * 0.7): code_train.write(tag + "@" + code + "@" + type + "\n") elif (i < numRows * 0.8): code_dev.write(tag + "@" + code + "@" + type + "\n") else: code_test.write(tag + "@" + code + "@" + type + "\n") #print str(id)+"---"+t code_train.close() code_dev.close() code_test.close()
def prepare_data():
    """Export ``selectTagType`` into six parallel line-aligned files:
    {train,test}{Code,Type,Tag}.txt, with an 80/20 split in table order.

    Line i of each trio belongs to the same DB row; code is normalized,
    anonymized and whitespace-squeezed before writing.
    """
    conn = mysql.connector.connect(user='******', password='******',
                                   database='codetag')  # , use_unicode=True
    cursor = conn.cursor(buffered=True)
    cursor.execute('select * from selectTagType')
    numRows = int(cursor.rowcount)
    code_train = codecs.open(path + 'trainCode.txt', 'w+', 'utf8')
    type_train = codecs.open(path + 'trainType.txt', 'w+', 'utf8')
    tag_train = codecs.open(path + 'trainTag.txt', 'w+', 'utf8')
    code_test = codecs.open(path + 'testCode.txt', 'w+', 'utf8')
    type_test = codecs.open(path + 'testType.txt', 'w+', 'utf8')
    tag_test = codecs.open(path + 'testTag.txt', 'w+', 'utf8')
    # PERF FIX: compile once instead of once per row.
    patternBlank = re.compile(' +')
    try:
        for i in range(numRows):
            row = cursor.fetchone()
            code = cc.code_anonymous(
                cc.get_normalize_code(
                    cc.remove_non_ascii_1(row[1].encode('utf-8')).replace(
                        "\n", " "), 1000))
            code = re.sub(patternBlank, " ", code)
            type = str(row[2]).replace('\n', '')
            tag = str(row[3]).replace('\n', '')
            # First 80% of rows go to train, the rest to test.
            if i < numRows * 0.8:
                code_train.write(code + '\n')
                type_train.write(type + '\n')
                tag_train.write(tag + '\n')
            else:
                code_test.write(code + '\n')
                type_test.write(type + '\n')
                tag_test.write(tag + '\n')
    finally:
        # BUG FIX: close all six outputs (and DB handles) even on error.
        code_train.close()
        type_train.close()
        tag_train.close()
        code_test.close()
        type_test.close()
        tag_test.close()
        cursor.close()
        conn.close()
def prepare_csv(): code_files = os.listdir(code_path) print len(code_files) ast_files = os.listdir(ast_path) print len(ast_files) ast_files.sort(key=lambda x: x[:-2]) print '' code_train = codecs.open(write_to + 'train.txt', 'w+', 'utf8') code_dev = codecs.open(write_to + 'dev.txt', 'w+', 'utf8') code_test = codecs.open(write_to + 'test.txt', 'w+', 'utf8') for file in ast_files: print file type = open(ast_path + file, 'r').read() code = cc.remove_cpp_comment(open(code_path + file, 'r').read()) #code = cc.code_anonymous( code = cc.get_normalize_code( cc.remove_non_ascii_1(code.encode('utf-8')).replace("\n", " "), 1000).replace("@", "").replace('\x00', '') patternBlank = re.compile(' +') code = re.sub(patternBlank, " ", code).replace("@", "") type = cc.remove_dupliacte(cc.string_reverse(type.replace('\n', ''))) tag = int(file.split(".")[0].split("-")[0]) - 1 i = int(file.split(".")[0].split("-")[1]) print i if (i < 500 * 0.7): code_train.write(str(tag) + "@" + code + "@" + type + "\n") elif (i < 500 * 0.8): code_dev.write(str(tag) + "@" + code + "@" + type + "\n") else: code_test.write(str(tag) + "@" + code + "@" + type + "\n") #prepare_csv()
def data_private(word_index, x):
    """Encode each sentence in ``x`` as a fixed-width row of vocabulary ids.

    Tokens absent from ``word_index`` map to the '<unknown>' id; each row is
    truncated at MAX_SENT_LENGTH tokens and zero-padded beyond the sentence
    length. Returns an int32 matrix of shape (len(x), MAX_SENT_LENGTH).
    """
    encoded = np.zeros((len(x), MAX_SENT_LENGTH), dtype='int32')
    for row, sentence in enumerate(x):
        tokens = cu.remove_blank(cu._WORD_SPLIT.split(sentence))
        # Only the first MAX_SENT_LENGTH tokens fit in a row.
        for col, token in enumerate(tokens[:MAX_SENT_LENGTH]):
            if token in word_index:
                encoded[row, col] = word_index[token]
            else:
                encoded[row, col] = word_index['<unknown>']
    return encoded
def get_tokenizer(all_text, max_word, voca_path): texts = [] for text in all_text: if (isinstance(text, basestring)): temp = cc.remove_blank(cc._WORD_SPLIT.split(text)) texts.extend(temp) counts = Counter(texts) common_list = counts.most_common() common_list.sort(key=lambda x: x[1], reverse=True) sorted_voc = [wc[0] for wc in common_list] word_picked = ['<unknown>'] word_picked.extend(sorted_voc) if (len(word_picked) > max_word): word_picked = word_picked[:max_word] word_index = dict() for word, index in zip(word_picked, range(max_word)): word_index[word] = index + 1 save_obj(word_index, voca_path, 'voca') print "unknown word index is " + str(word_index.get('<unknown>')) print "Nuber of unique token is " + str(len(word_index)) return word_index
def data_transfer(word_index, x, y):
    """Encode sentences and one-hot labels, then shuffle both in unison.

    Returns ``(data, labels)``: ``data`` is the (len(x), MAX_SENT_LENGTH)
    index matrix and ``labels`` the one-hot encoding of ``y`` over NUM_CLASS
    classes, both permuted by the same random row order.
    """
    # CONSISTENCY FIX: the original duplicated data_private's encoding loop
    # verbatim; call the shared helper instead.
    data = data_private(word_index, x)
    labels = to_categorical(np.asarray(y), num_classes=NUM_CLASS)
    print('Shape of data tensor:', data.shape)
    print('Shape of label tensor:', labels.shape)
    # Shuffle rows of data and labels with the same permutation.
    indices = np.arange(data.shape[0])
    np.random.shuffle(indices)
    data = data[indices]
    labels = labels[indices]
    return data, labels