import os
import json

import pandas as pd
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, MaxPooling1D, LSTM, Dropout, Dense
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# DatasetManager and FileManager are project-internal helpers;
# their import paths depend on the repository layout and are omitted here.


def main():
    # load the dataset
    datasetManager = DatasetManager()
    datasetManager.initialize('CNN').load()

    # collect every original source file and its language label
    code_archive = []
    languages = []
    for languageFolder in FileManager.getLanguagesFolders(
            FileManager.datasets['training']['url']):
        for exampleFolder in FileManager.getExamplesFolders(languageFolder.path):
            originalFileUrl = FileManager.getOriginalFileUrl(exampleFolder.path)
            originalFileContent = FileManager.readFile(originalFileUrl)
            code_archive.append(originalFileContent)
            languages.append(str(languageFolder.name).lower())

    # hyper-parameters
    max_features = 100000
    embed_dim = 128
    lstm_out = 64
    batch_size = 32
    epochs = 30
    test_size = 0.001

    # tokenize the source files and persist the word index for later inference
    tokenizer = Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(code_archive)
    dictionary = tokenizer.word_index
    FileManager.createFile(
        os.path.join(FileManager.getRootUrl(), 'tmp/wordindex.json'),
        json.dumps(dictionary))

    # encode each file as a fixed-length sequence of 100 token indexes
    X = tokenizer.texts_to_sequences(code_archive)
    X = pad_sequences(X, 100)
    # one-hot encode the language labels
    Y = pd.get_dummies(languages)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size)

    # CNN + LSTM model
    model = Sequential()
    model.add(Embedding(max_features, embed_dim, input_length=100))
    model.add(Conv1D(filters=128, kernel_size=3, padding='same',
                     dilation_rate=1, activation='relu'))
    model.add(MaxPooling1D(pool_size=4))
    model.add(Conv1D(filters=64, kernel_size=3, padding='same',
                     dilation_rate=1, activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(LSTM(lstm_out))
    model.add(Dropout(0.5))
    model.add(Dense(64))
    model.add(Dense(len(Y.columns), activation='softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size)

    # persist the trained model and its weights
    model.save(os.path.join(FileManager.getRootUrl(), 'tmp/code_model.h5'))
    model.save_weights(
        os.path.join(FileManager.getRootUrl(), 'tmp/code_model_weights.h5'))

    score, acc = model.evaluate(X_test, Y_test, verbose=2, batch_size=batch_size)
    print(model.metrics_names)
    print("Validation loss: %f" % score)
    print("Validation acc: %f" % acc)
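# --- Illustrative usage sketch (not part of the original script) -------------
# Assuming the artifacts written by main() above ('tmp/wordindex.json' and
# 'tmp/code_model.h5'), the same 100-token padding, and that the caller knows
# the label column order produced by pd.get_dummies (sorted language names),
# inference on a new snippet could look like the helper below. The name
# `predict_language` and the `labels` parameter are assumptions made here for
# illustration only.
def predict_language(snippet, labels):
    from keras.models import load_model
    from keras.preprocessing.text import text_to_word_sequence

    with open(os.path.join(FileManager.getRootUrl(), 'tmp/wordindex.json')) as f:
        word_index = json.load(f)

    # approximate the training-time tokenization (Tokenizer's default filters)
    words = text_to_word_sequence(snippet)
    sequence = [[word_index[w] for w in words if w in word_index]]
    padded = pad_sequences(sequence, 100)

    model = load_model(os.path.join(FileManager.getRootUrl(), 'tmp/code_model.h5'))
    probabilities = model.predict(padded)[0]
    return labels[int(probabilities.argmax())]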
# (method of the dataset-handling class; module-level imports assumed:
#  os, random, shutil, plus the project's FileManager, ConfigurationManager,
#  Parser and the TRAINING_EXAMPLES_NUMBER constant)
def __cloneFilesSources(self):
    SOURCE_URL = FileManager.datasets['source']['url']
    TRAINING_URL = FileManager.datasets['training']['url']
    TESTING_URL = FileManager.datasets['testing']['url']

    # for each directory in the '/Lang' folder ...
    languagesExamplesCounter = {}
    for languageFolder in [f for f in os.scandir(SOURCE_URL) if f.is_dir()]:
        language = str(languageFolder.name).lower()
        languagesExamplesCounter[language] = 0
        # parse only the selected languages
        if language in ConfigurationManager.getLanguages():
            # prepare an empty {language} folder in each dataset
            if not os.path.isdir(os.path.join(TRAINING_URL, language)):
                os.mkdir(os.path.join(TRAINING_URL, language))
            if not os.path.isdir(os.path.join(TESTING_URL, language)):
                os.mkdir(os.path.join(TESTING_URL, language))
            # count the examples for this language
            for exampleFolder in FileManager.getExamplesFolders(languageFolder.path):
                for _ in FileManager.getExampleFiles(exampleFolder.path):
                    languagesExamplesCounter[language] += 1
            # skip languages with fewer than {TRAINING_EXAMPLES_NUMBER} examples
            if languagesExamplesCounter[language] < TRAINING_EXAMPLES_NUMBER:
                print(' > [dataset] the total number of examples for the '
                      + language + ' is less than ' + str(TRAINING_EXAMPLES_NUMBER))
                continue
            # sample the indexes of the examples reserved for training;
            # exampleIndex below runs from 1 to N inclusive, so the upper
            # bound must be N + 1 (the original range(1, N) could never
            # assign the last example to training)
            indexesOfTrainingExamples = random.sample(
                range(1, languagesExamplesCounter[language] + 1),
                TRAINING_EXAMPLES_NUMBER
            )
            # list all examples in the {languageFolder.name} folder
            exampleIndex = 0
            for exampleFolder in FileManager.getExamplesFolders(languageFolder.path):
                # list all example versions in the {exampleFolder.name} folder
                for exampleVersionFile in FileManager.getExampleFiles(exampleFolder.path):
                    exampleIndex += 1
                    # route this file to the right dataset
                    if exampleIndex in indexesOfTrainingExamples:
                        DATASET_TYPE = TRAINING_URL
                    else:
                        DATASET_TYPE = TESTING_URL
                    # prepare the destination folder
                    example = str(exampleVersionFile.name).lower()
                    exampleFolderUri = os.path.join(DATASET_TYPE, language, example)
                    os.mkdir(exampleFolderUri)
                    # copy the ORIGINAL source file content
                    originalFileUri = FileManager.getOriginalFileUrl(exampleFolderUri)
                    FileManager.createFile(originalFileUri)
                    shutil.copyfile(exampleVersionFile.path, originalFileUri)
                    # create the 'PARSED' version of the original file
                    parsedFileUri = FileManager.getParsedFileUrl(exampleFolderUri)
                    FileManager.createFile(parsedFileUri)
                    parser = Parser()
                    parser.initialize(originalFileUri, parsedFileUri)
                    parser.parse()

    return self
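# --- Illustrative usage note (not part of the original class) ----------------
# __cloneFilesSources has a double leading underscore, so it is name-mangled
# and meant to be driven from inside its own class, presumably during the
# DatasetManager.initialize(...).load() call seen in main(). A plausible
# internal call site, assuming the method lives on DatasetManager, would be:
#
#     class DatasetManager:
#         def load(self):
#             self.__cloneFilesSources()
#             return self
#
# After it runs, each selected language holds TRAINING_EXAMPLES_NUMBER examples
# under the training dataset and the remainder under testing, with each example
# folder containing the copied original file plus its parsed version.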