def __init__(self, num_hidden_layers=1):
        ClassificationModule.__init__(self, "Meta Only NN", "A feedforward neural network which uses metadata (numerical features + used programming languages)")

        # Set input-size and output_size
        self.input_size = getMetadataLength()
        self.output_size = 7 # Hardcoded for 7 classes

        # Create model
        model = Sequential()
        # Add input-layer
        model.add(Dense(self.input_size, input_dim=self.input_size, init='uniform'))
        model.add(Activation('relu'))

        # Add hidden layers
        for _ in xrange(num_hidden_layers):
            model.add(Dense(self.input_size, init='uniform'))
            model.add(Activation('relu'))
        
        # Add output layer and normalize probabilities with softmax
        model.add(Dense(self.output_size, init='uniform'))
        model.add(Activation('softmax'))

        # Compile model and use Adam as optimizer
        model.compile(metrics=['accuracy'], loss='categorical_crossentropy', optimizer=Adam())

        self.model = model
        print "\t-", self.name
    def __init__(self, num_hidden_layers=3):
        ClassificationModule.__init__(self, "Foldernames Only LSTM", "A LSTM reading the foldernames character by character")

        hidden_size = 250
        self.maxlen = 100

        # Set output_size
        self.output_size = 7 # Hardcoded for 7 classes

        model = Sequential()

        # A maximum of self.maxlen characters is allowed, each as a one-hot-encoded array
        model.add(LSTM(hidden_size, input_shape=(self.maxlen, getLstmCharLength())))

        for _ in range(num_hidden_layers):
            model.add(Dense(hidden_size))
            # Alternative: model.add(LSTM(hidden_size)) -- stacking LSTMs would
            # require return_sequences=True on the preceding LSTM layers

        model.add(Dense(self.output_size))
        model.add(Activation('softmax'))

        model.compile(loss='categorical_crossentropy',
                      optimizer=SGD(),
                      metrics=['accuracy'])

        self.model = model
        print "\t-", self.name
    def __init__(self, num_hidden_layers=3):
        ClassificationModule.__init__(
            self, "Repo-Name Only LSTM",
            "A LSTM reading the repository-name character by character.\
        The input is a matrix, each row standing for a character with each column representing a lowercase ASCII-Character, punctuation or a number.\
        Up to 30 characters are fed into this net.\
        The first hidden layer is the LSTM-layer, the next 3 are standard and fully connected. The outputs are normalized using a softmax function.\
        The loss function is categorical crossentropy and the learning-rate of the used Adam-optimizer was set to 0.0025."
        )

        hidden_size = 250
        self.maxlen = 30

        # Set output_size
        self.output_size = 7  # Hardcoded for 7 classes

        model = Sequential()

        # A maximum of self.maxlen characters is allowed, each as a one-hot-encoded array
        model.add(
            LSTM(hidden_size, input_shape=(self.maxlen, getLstmCharLength())))

        for _ in range(num_hidden_layers):
            model.add(Dense(hidden_size))

        model.add(Dense(self.output_size))
        model.add(Activation('softmax'))

        model.compile(loss='categorical_crossentropy',
                      optimizer=Adam(lr=0.0025),
                      metrics=['accuracy'])

        self.model = model
        print "\t-", self.name
    def __init__(self, n_estimators=150):
        description = "Gradient Tree Boosting / Gradient Boosted Regression Trees (GBRT)."

        ClassificationModule.__init__(self, "Meta Only Gradient Tree Boosting", description)

        self.clf = GradientBoostingClassifier(n_estimators=n_estimators)
        print "\t-", self.name
    def __init__(self, n_estimators=250):
        ClassificationModule.__init__(
            self, "Meta Only Random Forest",
            "Ensemble Learner with 250 Decision-Trees as base-classifier. Uses only our metadata."
        )

        self.clf = RandomForestClassifier(n_estimators=n_estimators,
                                          class_weight='balanced')

        print "\t-", self.name
    def __init__(self, text_corpus):
        ClassificationModule.__init__(
            self, "Readme Only KNN",
            "A K-Nearest Neighbor Classifier trained and used on Readme-Vec")
        # Create vectorizer and fit on all available readmes
        self.vectorizer = getTextVectorizer(
            10000)  # Maximum number of distinct columns (words)
        corpus = []
        for description in text_corpus:
            corpus.append(process_text(description))
        self.vectorizer.fit(corpus)

        self.clf = KNeighborsClassifier(n_neighbors=10, algorithm='auto')
        print "\t-", self.name
    def __init__(self, text_corpus):
        ClassificationModule.__init__(self,
                                      "Readme Only Bernoulli Naive Bayes",
                                      "A Bernoulli Naive Bayes-Classifier")
        # Create vectorizer and fit on all available readmes
        self.vectorizer = getTextVectorizer(
            9000)  # Maximum number of distinct columns (words)
        corpus = []
        for description in text_corpus:
            corpus.append(process_text(description))
        self.vectorizer.fit(corpus)

        self.clf = BernoulliNB()
        print "\t-", self.name
    def __init__(self,
                 text_corpus,
                 filetype_corpus,
                 filename_corpus,
                 foldername_corpus,
                 n_estimators=200):

        description = "This is a Random Forest with 200 Decision Trees as base classifiers.\
        It has access to all available features: readme, description, metadata, filetypes/-names and foldernames.\
        The readme and description are both encoded by the same Tfidf-Vectorizer with a vocabulary of 6000 words.\
        Also the filetypes are encoded by such vectorizer, allowing encoding of 30 distinct filetypes.\
        The vectorizer for foldernames and filenames distinguishes 100 and 200 different names."

        ClassificationModule.__init__(self, "All Random Forest", description)

        self.vectorizer = getTextVectorizer(
            6000)  # Maximum number of distinct columns (words)
        self.filetypeVectorizer = getTextVectorizer(30)
        self.foldernameVectorizer = getTextVectorizer(100)
        self.filenameVectorizer = getTextVectorizer(200)

        # Vectorizer for descriptions and/or readmes
        corpus = []
        for text in text_corpus:
            corpus.append(process_text(text))
        self.vectorizer.fit(corpus)

        # Vectorizer for filetypes
        corpus = []
        for filetype in filetype_corpus:
            corpus.append(filetype)
        self.filetypeVectorizer.fit(corpus)

        # Vectorizer for filenames
        corpus = []
        for filename in filename_corpus:
            corpus.append(filename)
        self.filenameVectorizer.fit(corpus)

        # Vectorizer for foldernames
        corpus = []
        for folder in foldername_corpus:
            corpus.append(folder)
        self.foldernameVectorizer.fit(corpus)

        self.clf = RandomForestClassifier(n_estimators=n_estimators,
                                          class_weight='balanced')

        print "\t-", self.name
    def __init__(self, text_corpus, filetype_corpus, filename_corpus,
                 foldername_corpus):
        my_description = "This Support Vector Machine has access to the readme, description, all metadata, filenames, filetypes and foldernames. \
        The C and gamma for the non-linear rbf-kernel were set to 2000.0 and 0.01.\
        The readme and description are both encoded by the same Tfidf-Vectorizer with a vocabulary of 7000 words.\
        Also the filetypes are encoded by such vectorizer, allowing encoding of 30 distinct filetypes.\
        The vectorizer for foldernames and filenames both distinguish 150 different words."

        ClassificationModule.__init__(self, "ALL Support Vector Classifier",
                                      my_description)

        self.vectorizer = getTextVectorizer(
            7000)  # Maximum number of distinct columns (words)
        self.filetypeVectorizer = getTextVectorizer(30)
        self.foldernameVectorizer = getTextVectorizer(150)
        self.filenameVectorizer = getTextVectorizer(150)

        # Vectorizer for descriptions and/or readmes
        corpus = []
        for text in text_corpus:
            corpus.append(process_text(text))
        self.vectorizer.fit(corpus)

        # Vectorizer for filetypes
        corpus = []
        for filetype in filetype_corpus:
            corpus.append(filetype)
        self.filetypeVectorizer.fit(corpus)

        # Vectorizer for filenames
        corpus = []
        for filename in filename_corpus:
            corpus.append(filename)
        self.filenameVectorizer.fit(corpus)

        # Vectorizer for foldernames
        corpus = []
        for folder in foldername_corpus:
            corpus.append(folder)
        self.foldernameVectorizer.fit(corpus)

        # Create classifier
        self.clf = SVC(C=2000.0,
                       class_weight="balanced",
                       gamma=0.01,
                       probability=True)

        print "\t-", self.name
    def __init__(self, text_corpus):
        ClassificationModule.__init__(self, "Readme Only Logistic Regressor",
                                      "A Logistic Regressor")
        # Create vectorizer and fit on all available readmes
        self.vectorizer = getTextVectorizer(
            9000)  # Maximum number of distinct columns (words)
        corpus = []
        for readme in text_corpus:
            corpus.append(process_text(readme))
        self.vectorizer.fit(corpus)

        self.clf = LogisticRegression(multi_class='multinomial',
                                      solver='lbfgs',
                                      class_weight='balanced')

        print "\t-", self.name
    def __init__(self, text_corpus, n_estimators=150):
        description = "Gradient Tree Boosting / Gradient Boosted Regression Trees (GBRT)."

        ClassificationModule.__init__(self,
                                      "Readme Only Gradient Tree Boosting",
                                      description)

        # Create vectorizer and fit on all available readmes
        self.vectorizer = getTextVectorizer(
            5000)  # Maximum number of distinct columns (words)
        corpus = []
        for description in text_corpus:
            corpus.append(process_text(description))
        self.vectorizer.fit(corpus)

        self.clf = GradientBoostingClassifier(n_estimators=n_estimators)
        print "\t-", self.name
    def __init__(self, text_corpus):
        ClassificationModule.__init__(
            self, "Readme Only Random Forest",
            "Ensemble Learner with 200 Decision-Trees as base-classifiers.\
        The text is encoded by a Tfidf-Vectorizer, containing a vocabulary of 5000 distinct words."
        )

        # Create vectorizer and fit on all available readmes
        self.vectorizer = getTextVectorizer(
            5000)  # Maximum number of distinct columns (words)
        corpus = []
        for description in text_corpus:
            corpus.append(process_text(description))
        self.vectorizer.fit(corpus)

        self.clf = RandomForestClassifier(n_estimators=200)

        print "\t-", self.name
    def __init__(self, text_corpus, n_estimators=150):
        description = "Here 150 base classifiers are being used to predict \
        the class based on metadata as well as the short-description which gets encoded by a Tfidf-Vectorizer (Vocabulary Size: 6000)."

        ClassificationModule.__init__(
            self, "Description and Metadata Gradient Tree Boosting",
            description)

        # Create vectorizer and fit on all available descriptions
        self.vectorizer = getTextVectorizer(
            6000)  # Maximum number of distinct columns (words)
        corpus = []
        for description in text_corpus:
            corpus.append(process_text(description))
        self.vectorizer.fit(corpus)

        self.clf = GradientBoostingClassifier(n_estimators=n_estimators)
        print "\t-", self.name
    def __init__(self, text_corpus, filetype_corpus, filename_corpus,
                 foldername_corpus):
        ClassificationModule.__init__(
            self, "All Bernoulli NB",
            "A Bernoulli Naive Bayes-Classifier used with all data")

        # Create vectorizers and fit on all available corpora
        self.vectorizer = getTextVectorizer(
            6000)  # Maximum number of distinct columns (words)
        self.filetypeVectorizer = getTextVectorizer(30)
        self.foldernameVectorizer = getTextVectorizer(100)
        self.filenameVectorizer = getTextVectorizer(200)

        # Vectorizer for descriptions and/or readmes
        corpus = []
        for text in text_corpus:
            corpus.append(process_text(text))
        self.vectorizer.fit(corpus)

        # Vectorizer for filetypes
        corpus = []
        for filetype in filetype_corpus:
            corpus.append(filetype)
        self.filetypeVectorizer.fit(corpus)

        # Vectorizer for filenames
        corpus = []
        for filename in filename_corpus:
            corpus.append(filename)
        self.filenameVectorizer.fit(corpus)

        # Vectorizer for foldernames
        corpus = []
        for folder in foldername_corpus:
            corpus.append(folder)
        self.foldernameVectorizer.fit(corpus)

        # Create classifier
        self.clf = BernoulliNB()

        print "\t-", self.name
    def __init__(self, text_corpus):
        my_description = "This Support Vector Machine is trained on the readme, encoded with an TfIdf-Vectorizer, and metadata. \
                          This vectorizer has a vocabulary of 6000 distinct words."

        ClassificationModule.__init__(
            self, "Readme and Meta Support Vector Classifier", my_description)

        # Create vectorizer and fit on all available corpora
        self.vectorizer = getTextVectorizer(
            6000)  # Maximum number of distinct columns (words)

        # Vectorizer for descriptions and/or readmes
        corpus = []
        for text in text_corpus:
            corpus.append(process_text(text))
        self.vectorizer.fit(corpus)

        # Create classifier
        self.clf = SVC(C=1000.0, class_weight='balanced', probability=True)

        print "\t-", self.name
    def __init__(self, text_corpus, num_hidden_layers=1):
        ClassificationModule.__init__(
            self, "Readme Only NN",
            "A basic feedforward neural network with 2 hidden layers trained on readme (Tfidf-Vectorizer dimension = 6000)."
        )
        # Create vectorizer and fit on all available Descriptions
        self.vectorizer = getTextVectorizer(
            6000)  # Maximum of different columns
        corpus = []
        for description in text_corpus:
            corpus.append(process_text(description))
        self.vectorizer.fit(corpus)

        # Set input-size and output_size
        self.input_size = len(self.vectorizer.get_feature_names())
        self.output_size = 7  # Hardcoded for 7 classes

        # Create model
        model = Sequential()
        # Add input-layer
        model.add(
            Dense(self.input_size, input_dim=self.input_size, init='uniform'))
        model.add(Activation('relu'))

        # Add hidden layers
        for _ in xrange(num_hidden_layers):
            model.add(Dense(self.input_size, init='uniform'))
            model.add(Activation('relu'))

        # Add output layer and normalize probabilities with softmax
        model.add(Dense(self.output_size, init='uniform'))
        model.add(Activation('softmax'))

        # Compile model and use Adam as optimizer
        model.compile(metrics=['accuracy'],
                      loss='categorical_crossentropy',
                      optimizer=Adam())

        self.model = model
        print "\t-", self.name
    def __init__(self, file_corpus, foldername_corpus, n_estimators=150):
        description = "Gradient Tree Boosting / Gradient Boosted Regression Trees (GBRT)."

        ClassificationModule.__init__(self, "Files and Folders Gradient Tree Boosting", description)

        self.fileVectorizer = getTextVectorizer(50)
        self.foldernameVectorizer = getTextVectorizer(50)

        # Vectorizer for files
        corpus = []
        for filename in file_corpus:
            corpus.append(filename)
        self.fileVectorizer.fit(corpus)

        # Vectorizer for foldernames
        corpus = []
        for folder in foldername_corpus:
            corpus.append(folder)
        self.foldernameVectorizer.fit(corpus)

        self.clf = GradientBoostingClassifier(n_estimators=n_estimators)
        print "\t-", self.name
    def __init__(self, num_hidden_layers=3):
        ClassificationModule.__init__(
            self, "Readme Only Word2Vec LSTM",
            "A LSTM-Network reading the Readme word by word.\
        We used a Word2Vec-Model trained on Google-News articles, providing an embedding of 3 million different words.\
        This embedding comprises 300 dimensions. Each word is then fed into an LSTM-layer being followed by 3 Dense-layers.\
        Optimizer: Adam, loss: categorical crossentropy.")

        hidden_size = 300
        self.maxlen = 1000

        print "\tLoading word2vec Model"
        path = os.path.dirname(__file__) + "/../../Word2VecModel/"
        modelName = 'GoogleNews-vectors-negative300.bin'
        self.word2vecModel = Word2Vec.load_word2vec_format(path + modelName,
                                                           binary=True)

        # Set output_size
        self.output_size = 7  # Hardcoded for 7 classes

        model = Sequential()

        # A maximum of self.maxlen words is allowed, each as a word2vec embedding
        model.add(
            LSTM(hidden_size,
                 return_sequences=True,
                 input_shape=(self.maxlen, self.word2vecModel.vector_size)))

        for i in range(num_hidden_layers):
            # All but the last stacked LSTM must return the full sequence
            model.add(LSTM(hidden_size,
                           return_sequences=(i < num_hidden_layers - 1)))

        model.add(Dense(self.output_size))
        model.add(Activation('softmax'))

        model.compile(loss='categorical_crossentropy',
                      optimizer=Adam(),
                      metrics=['accuracy'])

        self.model = model
        print "\t-", self.name
    def __init__(self,
                 text_corpus,
                 filetype_corpus,
                 filename_corpus,
                 foldername_corpus,
                 num_hidden_layers=1):
        ClassificationModule.__init__(
            self, "All NN",
            "A basic feedforward neural network with 3 hidden layers.\
        The used activation function is LeakyReLU. Trained with Adam-optimizer.\
        Features are metadata and all text-features except the repository-name.\
        The readme and description are both encoded by the same Tfidf-Vectorizer with a vocabulary of 7000 words.\
        Also the filetypes are encoded by such vectorizer, allowing encoding of 30 distinct filetypes.\
        The vectorizer for foldernames and filenames both distinguish 150 different words."
        )

        self.vectorizer = getTextVectorizer(
            7000)  # Maximum number of distinct columns (words)
        self.filetypeVectorizer = getTextVectorizer(30)
        self.foldernameVectorizer = getTextVectorizer(150)
        self.filenameVectorizer = getTextVectorizer(150)

        # Vectorizer for descriptions and/or readmes
        corpus = []
        for text in text_corpus:
            corpus.append(process_text(text))
        self.vectorizer.fit(corpus)

        # Vectorizer for filetypes
        corpus = []
        for filetype in filetype_corpus:
            corpus.append(filetype)
        self.filetypeVectorizer.fit(corpus)

        # Vectorizer for filenames
        corpus = []
        for filename in filename_corpus:
            corpus.append(filename)
        self.filenameVectorizer.fit(corpus)

        # Vectorizer for foldernames
        corpus = []
        for folder in foldername_corpus:
            corpus.append(folder)
        self.foldernameVectorizer.fit(corpus)

        # Set input-size and output_size
        self.input_size = (len(self.vectorizer.get_feature_names())
                           + getMetadataLength()
                           + len(self.filetypeVectorizer.get_feature_names())
                           + len(self.foldernameVectorizer.get_feature_names())
                           + len(self.filenameVectorizer.get_feature_names()))
        self.output_size = 7  # Hardcoded for 7 classes

        # Create model
        model = Sequential()
        # Add input-layer
        model.add(
            Dense(self.input_size, input_dim=self.input_size, init='uniform'))
        model.add(LeakyReLU())

        # Add hidden layers
        for _ in xrange(num_hidden_layers):
            model.add(Dense(self.input_size, init='uniform'))
            model.add(LeakyReLU())

        # Add output layer and normalize probabilities with softmax
        model.add(Dense(self.output_size, init='uniform'))
        model.add(Activation('softmax'))

        # Compile model and use Adam as optimizer
        model.compile(metrics=['accuracy'],
                      loss='categorical_crossentropy',
                      optimizer=Adam())

        self.model = model
        print "\t-", self.name
    def __init__(self):
        ClassificationModule.__init__(self, "Meta Only AdaBoostClassifier (Decision-Trees)", 'AdaBoostClassifier (base estimator: Decision-Trees) used with Meta-Data (Programming-Languages, stars, watches, ...)')
        self.clf = AdaBoostClassifier()
        print "\t-", self.name