def __prepareFeatures(self, dataset: str):
    # find or create the features file
    self.__calculateTokensEntropyLoss(dataset)
    # get features file
    languageFeatures = json.loads(
        FileManager.readFile(FileManager.getFeaturesFileUrl(self.type)))

    X = []
    Y = []

    sources, languages = self.extractSources(dataset)
    for idx, source in enumerate(sources):
        language = languages[idx]
        features = []
        tokens = set(source.split(' '))
        # X: one binary slot per (language, token) feature
        for _lang in languageFeatures:
            for _tk in languageFeatures[_lang]:
                if _tk in tokens:
                    features.append(1)
                else:
                    features.append(0)
        X.append(features)
        # Y
        Y.append(language)

    return X, Y
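# A minimal sketch of the encoding performed by __prepareFeatures, assuming
# the features file maps each language to a list of characteristic tokens
# (layout inferred from the loops above, not confirmed from the real file):
#
#   languageFeatures = {'python': ['def', 'import'], 'java': ['public', 'void']}
#   tokens = set('import os def main'.split(' '))
#   features = [1 if tk in tokens else 0
#               for lang in languageFeatures
#               for tk in languageFeatures[lang]]
#   # -> [1, 1, 0, 0]: one slot per (language, token) pair, in iteration order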
def __loadInMemory(self):
    # training and testing share the same on-disk layout, so load both
    # with one loop instead of two duplicated blocks
    for datasetType in ('training', 'testing'):
        datasetUrl = FileManager.datasets[datasetType]['url']
        for languageFolder in FileManager.getLanguagesFolders(datasetUrl):
            language = str(languageFolder.name).lower()
            self.Dataset.addLanguage(datasetType, language)
            # examples
            for exampleFolder in FileManager.getExamplesFolders(languageFolder.path):
                exampleDict: dict = {}
                # original file
                originalFileUri = FileManager.getOriginalFileUrl(exampleFolder.path)
                exampleDict['original'] = FileManager.readFile(originalFileUri)
                # parsed file
                parsedFileUri = FileManager.getParsedFileUrl(exampleFolder.path)
                exampleDict['parsed'] = FileManager.readFile(parsedFileUri)
                # save
                self.Dataset.addExample(datasetType, language, exampleDict)

    return self
def load(self):
    datasetAlreadyExists = self.__create_folders()
    # clone file sources if the dataset doesn't already exist
    if not datasetAlreadyExists:
        self.__cloneFilesSources()

    if not os.path.exists(FileManager.getDatasetCopyFileUrl()):
        # load dataset in memory
        self.__loadInMemory()
        # generate 'filtered' version
        self.__filterSources()
        # save dataset copy
        datasetCopy: dict = {'training': self.Dataset.training,
                             'testing': self.Dataset.testing}
        FileManager.writeFile(FileManager.getDatasetCopyFileUrl(), json.dumps(datasetCopy))
    else:
        datasetCopy = json.loads(FileManager.readFile(FileManager.getDatasetCopyFileUrl()))
        self.Dataset.training = datasetCopy['training']
        self.Dataset.testing = datasetCopy['testing']

    return self
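# Hypothetical usage of the loader above (the 'CNN' argument mirrors the
# training script below; attribute names follow load() itself):
#
#   datasetManager = DatasetManager()
#   datasetManager.initialize('CNN').load()
#   training = datasetManager.Dataset.training   # filled by __loadInMemory
#   testing = datasetManager.Dataset.testing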
def main():
    data = {"success": False}
    languages = ConfigurationManager.getLanguages()
    matched = 0
    totalExamples = 0

    for languageFolder in FileManager.getLanguagesFolders(FileManager.datasets['testing']['url']):
        language = str(languageFolder.name).lower()
        for exampleFolder in FileManager.getExamplesFolders(languageFolder.path):
            totalExamples += 1
            X_test = []
            originalFileContent = FileManager.readFile(FileManager.getOriginalFileUrl(exampleFolder.path))
            word_vec = convert_text_to_index_array(originalFileContent)
            X_test.append(word_vec)
            X_test = pad_sequences(X_test, maxlen=100)
            y_prob = model.predict(X_test[0].reshape(1, X_test.shape[1]), batch_size=1, verbose=2)[0]
            idx = np.argmax(np.array(y_prob))
            if str(languages[idx]) == language:
                matched += 1

            # data["predictions"] = []
            # for i in range(len(languages)):
            #     r = {"label": languages[i], "probability": format(y_prob[i] * 100, '.2f')}
            #     data["predictions"].append(r)

    print('')
    print('totalExamples = ' + str(totalExamples))
    print('matched = ' + str(matched))
    print('matched / totalExamples = ' + str(matched / totalExamples))
    print('')
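# The commented-out block above builds a per-language probability response;
# a runnable equivalent, assuming `languages` and `y_prob` as computed in
# the loop (`build_predictions` is a hypothetical helper, not part of the
# script):

def build_predictions(languages, y_prob):
    # pair each candidate language with its softmax probability,
    # formatted as a percentage string
    return [
        {"label": str(lang), "probability": format(float(prob) * 100, '.2f')}
        for lang, prob in zip(languages, y_prob)
    ]

# e.g. build_predictions(['java', 'python'], [0.25, 0.75])
# -> [{'label': 'java', 'probability': '25.00'},
#     {'label': 'python', 'probability': '75.00'}]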
def main():
    # load the dataset
    datasetManager = DatasetManager()
    datasetManager.initialize('CNN').load()

    code_archive = []
    languages = []
    for languageFolder in FileManager.getLanguagesFolders(FileManager.datasets['training']['url']):
        for exampleFolder in FileManager.getExamplesFolders(languageFolder.path):
            originalFileUrl = FileManager.getOriginalFileUrl(exampleFolder.path)
            originalFileContent = FileManager.readFile(originalFileUrl)
            code_archive.append(originalFileContent)
            languages.append(str(languageFolder.name).lower())

    # added - and @
    max_features = 100000
    embed_dim = 128
    lstm_out = 64
    batch_size = 32
    epochs = 30
    test_size = 0.001

    tokenizer = Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(code_archive)
    dictionary = tokenizer.word_index
    FileManager.createFile(
        os.path.join(FileManager.getRootUrl(), 'tmp/wordindex.json'),
        json.dumps(dictionary))

    X = tokenizer.texts_to_sequences(code_archive)
    X = pad_sequences(X, maxlen=100)
    Y = pd.get_dummies(languages)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size)

    # CNN + LSTM model
    model = Sequential()
    model.add(Embedding(max_features, embed_dim, input_length=100))
    model.add(Conv1D(filters=128, kernel_size=3, padding='same', activation='relu'))
    model.add(MaxPooling1D(pool_size=4))
    model.add(Conv1D(filters=64, kernel_size=3, padding='same', activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(LSTM(lstm_out))
    model.add(Dropout(0.5))
    model.add(Dense(64))
    model.add(Dense(len(Y.columns), activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size)

    model.save(os.path.join(FileManager.getRootUrl(), 'tmp/code_model.h5'))
    model.save_weights(os.path.join(FileManager.getRootUrl(), 'tmp/code_model_weights.h5'))

    score, acc = model.evaluate(X_test, Y_test, verbose=2, batch_size=batch_size)
    print(model.metrics_names)
    print("Validation loss: %f" % score)
    print("Validation acc: %f" % acc)
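# For orientation, tensor shapes through the stack above (batch size B,
# inferred from the layer parameters; not printed by the script itself):
#
#   Embedding(100000, 128, input_length=100)  (B, 100)      -> (B, 100, 128)
#   Conv1D(128, 3, padding='same') + ReLU     (B, 100, 128) -> (B, 100, 128)
#   MaxPooling1D(pool_size=4)                 (B, 100, 128) -> (B, 25, 128)
#   Conv1D(64, 3, padding='same') + ReLU      (B, 25, 128)  -> (B, 25, 64)
#   MaxPooling1D(pool_size=2)                 (B, 25, 64)   -> (B, 12, 64)
#   LSTM(64) + Dropout(0.5)                   (B, 12, 64)   -> (B, 64)
#   Dense(64) -> Dense(n_languages, softmax)  (B, 64)       -> (B, n_languages)
#
# model.summary() prints the same breakdown.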
import keras.preprocessing.text as kpt
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
import sys
import os
import json
import numpy as np

from utils import ConfigurationManager, FileManager

# global dictionary and model
global model
dictionaryUrl = os.path.join(FileManager.getRootUrl(), 'tmp/wordindex.json')
dictionary = json.loads(FileManager.readFile(dictionaryUrl))
modelUrl = os.path.join(FileManager.getRootUrl(), 'tmp/code_model.h5')
model = load_model(modelUrl)


def convert_text_to_index_array(text):
    # `text_to_word_sequence` only tokenizes the text; padding to a fixed
    # length is handled later by `pad_sequences`. Words outside the
    # 100,000-word vocabulary used at training time are skipped.
    wordvec = []
    for word in kpt.text_to_word_sequence(text):
        if word in dictionary:
            if dictionary[word] <= 100000:
                wordvec.append([dictionary[word]])
    return wordvec
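# Hypothetical usage (the snippet is illustrative; actual indices depend on
# the trained word index in tmp/wordindex.json):
#
#   word_vec = convert_text_to_index_array('public static void main')
#   # word_vec is a list of one-element lists, e.g. [[12], [3], [87], [5]];
#   # pad_sequences([word_vec], maxlen=100) then yields shape (1, 100, 1),
#   # which the evaluation script reshapes to (1, 100) before model.predict.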
def importVocabulary(self):
    return json.loads(FileManager.readFile(FileManager.getVocabularyFileUrl(self.type)))