Example #1
0
def select_code_data():
    conn = mysql.connector.connect(user='******',
                                   password='******',
                                   database='codetag')  # , use_unicode=True
    cursor = conn.cursor(buffered=True)
    cursorInsert = conn.cursor()
    cursor.execute('select * from SampleC')
    numRows = int(cursor.rowcount)
    print numRows
    data_dict = dict()
    for i in range(numRows):
        row = cursor.fetchone()
        id = row[0]
        code = cc.code_clean(cc.remove_cpp_comment(row[1].encode('utf-8')))
        tags = str(row[2]).split('#')
        finalTag = ''

        for tag in tags:
            if (tag in tagSet):
                if (tag in data_dict):
                    data_dict[tag] += 1
                else:
                    data_dict[tag] = 1
                finalTag = tag

        if (finalTag != ''):
            sql = "insert into selectTag(Id,Code,Tags) values(%s,%s,%s)"
            data = [id, code, finalTag]
            # print data
            cursorInsert.execute(sql, data)
        print str(i) + "----" + str(numRows)
    for k, v in data_dict.iteritems():
        print k, v
    conn.commit()
    conn.close()
Example #2
0
def prepare_csv():
    conn = mysql.connector.connect(user='******',
                                   password='******',
                                   database='codetag')  # , use_unicode=True
    cursor = conn.cursor(buffered=True)
    tagDict = create_dict()

    code_train = codecs.open(path + 'train.txt', 'w+', 'utf8')
    code_dev = codecs.open(path + 'dev.txt', 'w+', 'utf8')
    code_test = codecs.open(path + 'test.txt', 'w+', 'utf8')

    for t in tagSet:
        #cursor.execute('select * from selectTagType where ')
        cursor.execute(
            'select * from selectTagType where Tags = %s ORDER BY Id', (t, ))
        numRows = int(cursor.rowcount)
        print str(numRows) + '---------' + t
        for i in range(numRows):
            row = cursor.fetchone()
            id = row[0]
            code = cc.code_anonymous(
                cc.get_normalize_code(
                    cc.remove_non_ascii_1(row[1].encode('utf-8')).replace(
                        "\n", " "), 1000)).replace('\x00', '')
            #code = cc.get_normalize_code(cc.remove_non_ascii_1(row[1].encode('utf-8')).replace("\n", " "), 1000).replace("@","").replace('\x00', '')
            patternBlank = re.compile(' +')
            code = re.sub(patternBlank, " ", code).replace("@", "")
            type = cc.remove_dupliacte(
                cc.string_reverse(str(row[2]).replace('\n', '')))
            tag = str(tagDict.get(str(row[3]).replace('\n', '')))
            if (i < numRows * 0.7):
                code_train.write(tag + "@" + code + "@" + type + "\n")
            elif (i < numRows * 0.8):
                code_dev.write(tag + "@" + code + "@" + type + "\n")
            else:
                code_test.write(tag + "@" + code + "@" + type + "\n")
            #print str(id)+"---"+t

    code_train.close()
    code_dev.close()
    code_test.close()
Example #3
0
def prepare_data():
    """Dump selectTagType into parallel code/type/tag files, split 80/20.

    Every row yields one line in each of three parallel files (Code, Type,
    Tag); the first 80% of rows go to the train files, the rest to test.

    Side effects: reads the DB and writes six files under `path`. Assumes
    module globals `mysql`, `cc`, `re`, `codecs`, `path` are available.
    """
    conn = mysql.connector.connect(user='******',
                                   password='******',
                                   database='codetag')  # , use_unicode=True
    cursor = conn.cursor(buffered=True)
    cursor.execute('select * from selectTagType')
    numRows = int(cursor.rowcount)

    # Hoisted out of the row loop: one compile instead of one per row.
    patternBlank = re.compile(' +')

    code_train = codecs.open(path + 'trainCode.txt', 'w+', 'utf8')
    type_train = codecs.open(path + 'trainType.txt', 'w+', 'utf8')
    tag_train = codecs.open(path + 'trainTag.txt', 'w+', 'utf8')

    code_test = codecs.open(path + 'testCode.txt', 'w+', 'utf8')
    type_test = codecs.open(path + 'testType.txt', 'w+', 'utf8')
    tag_test = codecs.open(path + 'testTag.txt', 'w+', 'utf8')
    try:
        for i in range(numRows):
            row = cursor.fetchone()
            # row[0] (Id) is intentionally unused here.
            code = cc.code_anonymous(
                cc.get_normalize_code(
                    cc.remove_non_ascii_1(row[1].encode('utf-8')).replace(
                        "\n", " "), 1000))
            code = patternBlank.sub(" ", code)
            type = str(row[2]).replace('\n', '')
            tag = str(row[3]).replace('\n', '')
            # 80/20 train/test split by row position.
            if i < numRows * 0.8:
                code_train.write(code + '\n')
                type_train.write(type + '\n')
                tag_train.write(tag + '\n')
            else:
                code_test.write(code + '\n')
                type_test.write(type + '\n')
                tag_test.write(tag + '\n')
    finally:
        # Close everything even if a row raises; the original leaked all
        # six handles plus the connection on any mid-loop error.
        code_train.close()
        type_train.close()
        tag_train.close()
        code_test.close()
        type_test.close()
        tag_test.close()
        cursor.close()
        conn.close()
Example #4
0
def prepare_csv():
    code_files = os.listdir(code_path)
    print len(code_files)

    ast_files = os.listdir(ast_path)
    print len(ast_files)
    ast_files.sort(key=lambda x: x[:-2])
    print ''

    code_train = codecs.open(write_to + 'train.txt', 'w+', 'utf8')
    code_dev = codecs.open(write_to + 'dev.txt', 'w+', 'utf8')
    code_test = codecs.open(write_to + 'test.txt', 'w+', 'utf8')

    for file in ast_files:
        print file
        type = open(ast_path + file, 'r').read()
        code = cc.remove_cpp_comment(open(code_path + file, 'r').read())

        #code = cc.code_anonymous(
        code = cc.get_normalize_code(
            cc.remove_non_ascii_1(code.encode('utf-8')).replace("\n", " "),
            1000).replace("@", "").replace('\x00', '')

        patternBlank = re.compile(' +')
        code = re.sub(patternBlank, " ", code).replace("@", "")
        type = cc.remove_dupliacte(cc.string_reverse(type.replace('\n', '')))

        tag = int(file.split(".")[0].split("-")[0]) - 1
        i = int(file.split(".")[0].split("-")[1])
        print i
        if (i < 500 * 0.7):
            code_train.write(str(tag) + "@" + code + "@" + type + "\n")
        elif (i < 500 * 0.8):
            code_dev.write(str(tag) + "@" + code + "@" + type + "\n")
        else:
            code_test.write(str(tag) + "@" + code + "@" + type + "\n")


#prepare_csv()
 def data_private(word_index, x):
     data = np.zeros((len(x), MAX_SENT_LENGTH), dtype='int32')
     for i, sentences in enumerate(x):
         wordTokens = cu._WORD_SPLIT.split(sentences)
         wordTokens = cu.remove_blank(wordTokens)
         k = 0
         for _, word in enumerate(wordTokens):
             if (k < MAX_SENT_LENGTH):
                 if (word not in word_index):
                     data[i, k] = word_index['<unknown>']
                 else:
                     data[i, k] = word_index[word]
             k = k + 1
     return data
def get_tokenizer(all_text, max_word, voca_path):
    texts = []
    for text in all_text:
        if (isinstance(text, basestring)):
            temp = cc.remove_blank(cc._WORD_SPLIT.split(text))
            texts.extend(temp)
    counts = Counter(texts)
    common_list = counts.most_common()
    common_list.sort(key=lambda x: x[1], reverse=True)
    sorted_voc = [wc[0] for wc in common_list]
    word_picked = ['<unknown>']
    word_picked.extend(sorted_voc)
    if (len(word_picked) > max_word):
        word_picked = word_picked[:max_word]
    word_index = dict()
    for word, index in zip(word_picked, range(max_word)):
        word_index[word] = index + 1
    save_obj(word_index, voca_path, 'voca')
    print "unknown word index is " + str(word_index.get('<unknown>'))
    print "Nuber of unique token is " + str(len(word_index))
    return word_index
Example #7
0
def data_transfer(word_index, x, y):
    """Vectorize sentences and one-hot labels, then shuffle them in unison.

    Sentences in `x` are tokenized with cu._WORD_SPLIT, blank tokens
    dropped, and the first MAX_SENT_LENGTH tokens mapped through
    `word_index` (misses fall back to word_index['<unknown>']); the rest of
    each row stays zero-padded. Labels `y` are one-hot encoded to
    NUM_CLASS columns, and both arrays are shuffled with the same random
    permutation before being returned.
    """
    data = np.zeros((len(x), MAX_SENT_LENGTH), dtype='int32')
    for row, sentence in enumerate(x):
        tokens = cu.remove_blank(cu._WORD_SPLIT.split(sentence))
        pos = 0
        for word in tokens:
            if pos < MAX_SENT_LENGTH:
                known = word in word_index
                data[row, pos] = word_index[word] if known else word_index['<unknown>']
            pos += 1

    labels = to_categorical(np.asarray(y), num_classes=NUM_CLASS)
    print('Shape of data tensor:', data.shape)
    print('Shape of label tensor:', labels.shape)
    order = np.arange(data.shape[0])
    np.random.shuffle(order)
    return data[order], labels[order]