Example #1
    def testNormalization(self):
        print("** Test normalization **")

        the_normalizer = Normalizer("datasets/test_normalization.csv")

        normalized = [[0, 0, 0], [1, 1, 1], [0.5, 0.1, 0.9]]

        self.assertTrue(the_normalizer.normalize() == normalized,
                        "Normalized data doesn't match")
def detectCircle(im):
    # detect circles in the image
    n = Normalizer(170)

    im = n.crop(im)
    # resize so the result fits within a 170x170 box, preserving aspect ratio
    new = imutils.resize(im, height=170)
    if new.shape[1] > 170:
        new = imutils.resize(im, width=170)

    circles = cv2.HoughCircles(new, cv2.HOUGH_GRADIENT, 1.5, minDist=170, param2=30, minRadius=70, maxRadius=85)

    return circles is not None
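A minimal usage sketch (not part of the original snippet), assuming cv2, imutils and the Normalizer used above are importable and that sample.png is a hypothetical grayscale test image:

import cv2  # assumed to be imported by the original module as well

image = cv2.imread("sample.png", 0)  # load as single-channel grayscale
if image is not None:
    print("circle detected" if detectCircle(image) else "no circle detected")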
Example #3
    def testCSVIntegrity(self):
        print("** Test CSV Integrity **")

        the_normalizer = Normalizer("datasets/test_normalization.csv")

        data = the_normalizer.get_csv()

        origin_data = [['0', '3', '0'], ['1', '33', '100'], ['0.5', '6', '90']]

        length = 3

        self.assertTrue(data == origin_data, "Data and CSV file don't match")
        self.assertTrue(length == the_normalizer.getRowLength(),
                        "Line length doesn't match")
Example #4
    def __init__(self, k, n, columns, datafile):
        """Constructeur pour la classe KMeanClusterer"""
        super(KMeanClusterer, self).__init__()

        # Number of clusters wanted
        self.k = k
        self.n = n

        self.is_over = False

        # columns to work with
        self.columns = sorted(columns)

        # Get CSV data
        norm = Normalizer(datafile)
        self.data = norm.normalize()
        self.row_length = norm.getRowLength()
        self.clusters = []
Example #5
def brain(command):
    response = ""
    command = command
    # from 0  =>> 15 is verb for search and find
    # from 16 =>> 21 is verb for open
    actions = [
        "search", "find", "view", "reach", "detect", "get", "catch", "explore",
        "achieve", "obtain", "pass", "check", "reveal", "expose", "observe",
        "show", "see", "listen", "hear", "open", "watch", "arise", "awaken",
        "call", "consciousness", "get up", "stir", "wake", "wake up"
    ]

    tokens = Tokenizer().tokenize(command)

    # call the weather function if the command contains a weather word and a country or city name
    citiesORcountries = weatherFunction(command)
    if 'weather' in command.split() and citiesORcountries:
        return ('the weather in ' + citiesORcountries[0] + ' is ' +
                WeatherC().weatherForecast(citiesORcountries[0]) + ' today')

    action = None

    fileName = None
    # -----------------------------------<<Variable>>--------------------------------------------
    tagSentence = Tagger().tag(tokens)

    for counter in range(len(tagSentence)):
        # if tagSentence[counter][1] == 'VB' or tagSentence[counter][0] in self.actions:

        if tagSentence[counter][0] in actions:

            action = tagSentence[counter][0]

        elif tagSentence[counter][1] == 'NN':
            fileName = tagSentence[counter][0]

    normlizeAction = Normalizer().snowBallStemmer(action)

    if normlizeAction in actions:
        # returns a list of files sharing the same name
        filePath = FileSearch().search(fileName)

        if normlizeAction in actions[:15]:
            # search for a folder or file and open its location
            OpenMedia().openFile(filePath[0].split("//")[0])
            response = "i hope you're satisfied with our service"
            return response

        if normlizeAction in actions[15:21]:
            # media actions: nothing special is done yet when the extension is
            # not a known media type
            if normlizeAction in [
                    'listen', 'hear', 'watch'
            ] and filePath[0].split('.')[-1] not in ['mp3', 'mp4', 'mkv']:
                pass
            OpenMedia().openFile(filePath[0])
def loading_dataSet():
    with open("res/dataset.txt", "r") as f:
        data = f.read()
    docs = data.split("\n")
    types = []
    train = []
    for d in docs:
        d = d.split()
        if len(d) != 0:
            types.append(d[0])
    print('dataset Count = ' + str(len(types)))
    normalized_corpus = Normalizer.normalize_corpus(docs)
    normalized_corpus.remove('')
    for counter, x in enumerate(normalized_corpus):
        train.append((x, types[counter]))
    return train
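An illustrative check (not from the original project), under the assumption that each line of res/dataset.txt starts with a class-label token followed by the document text:

# print the first few (normalized_text, label) pairs produced above
for text, label in loading_dataSet()[:3]:
    print(label + ' -> ' + text[:40])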
def classify_btn_clicked():
    def setClassification(label):
        if label == '1':
            classi_out.setPlainText('culture')
        elif label == '2':
            classi_out.setPlainText('sport')
        elif label == '3':
            classi_out.setPlainText('economy')
        elif label == '4':
            classi_out.setPlainText('international')
        elif label == '5':
            classi_out.setPlainText('local')
        elif label == '6':
            classi_out.setPlainText('religion')

    tester_doc = file_.toPlainText().strip()
    normalized_tester_doc = Normalizer.normalize_corpus([tester_doc])
    featuresets_test = [features(words) for words in normalized_tester_doc]
    predicted_label = classifier.classify_many(featuresets_test)
    setClassification(predicted_label[0])
    def brn(self):

        action = None
        fileName = None

        tagSentence = Tagger().tag(self.tokens)

        for counter in range(len(tagSentence)):
            # if tagSentence[counter][1] == 'VB' or tagSentence[counter][0] in self.actions:

            if tagSentence[counter][0] in self.actions:
                action = tagSentence[counter][0]

            elif tagSentence[counter][1] == 'NN':
                fileName = tagSentence[counter][0]

        normlizeAction = Normalizer().snowBallStemmer(action)

        if normlizeAction in self.actions:
            # returns a list of files sharing the same name
            filePath = FileSearch().search(fileName)

            if normlizeAction in self.actions[:15]:
                # search for a folder or file and open its location
                OpenMedia().openFile(filePath[0].split("//")[0])

            if normlizeAction in self.actions[15:21]:
                OpenMedia().openFile(filePath[0])

        else:
            pass
            # return "can you explain more"

# Brain("i wanna open workout").brn()
Example #10
    def getDatasetSize(self, datafile):

        norm = Normalizer()
        iris_data_matrix = norm.load_csv(datafile)
        return len(iris_data_matrix)
class FeatureExtractor:
    """
    Contains methods for corpus normalization.
    """
    def __init__(self):
        # Text normalizer
        self.normalizer = Normalizer()

    def bag_of_words(self, corpus, ngram_range=(1, 1), type_="binary"):
        """
        Generate bag of words for each document of a corpus.
        
        Args:
            corpus (list of str): List of documents
            ngram_range (tuple of int): Minimum and maximum size of ngrams in text
                                        used only if type is *-ngram
            type_ (str): Type of bag of words:
                            - binary
                            - frequency
                            - tfidf
                            - binary-ngram
                            - frequency-ngram
                            - tfidf-ngram
            
            Returns:
                list of str/tuple of str:int pairs: Bag of words/ngrams
        """
        corpus = [
            self.normalizer.normalize_text(document) for document in corpus
        ]

        if type_ == "binary":
            bag_of_words = feature_extraction.bag_of_words_binary_corpus(
                corpus)
        elif type_ == "frequency":
            bag_of_words = feature_extraction.bag_of_words_frequencies_corpus(
                corpus)
        elif type_ == "tfidf":
            bag_of_words = feature_extraction.bag_of_words_tfidf_corpus(corpus)
        elif type_ == "binary-ngram":
            bag_of_words = feature_extraction.bag_of_ngrams_binary_corpus(
                corpus, ngram_range[0])
        elif type_ == "frequency-ngram":
            bag_of_words = feature_extraction.bag_of_ngrams_frequencies_range_corpus(
                corpus, ngram_range)
        elif type_ == "tfidf-ngram":
            bag_of_words = feature_extraction.bag_of_ngrams_tfidf_range_corpus(
                corpus, ngram_range)
        else:
            raise ValueError(
                """Wrong type_ input. Type help(bag_of_words) to see supported types."""
            )

        return bag_of_words

    def feature_matrix(self, corpus, ngram_range=(1, 1), type_="binary"):
        """
        Generate feature matrix for each document of a corpus.
        
        Args:
            corpus (list of str): List of documents
            ngram_range (tuple of int): Minimum and maximum size of ngrams in text
                                        used only if type is *-ngram
            type_ (str): Type of bag of words:
                            - binary
                            - frequency
                            - tfidf
                            - binary-ngram
                            - frequency-ngram
                            - tfidf-ngram
            
            Returns:
                tuple of (dict, list of list): Vocabulary mapping each
                    word/ngram to an integer id, and the feature matrix with
                    one vector per document
        """
        bag_of_words = self.bag_of_words(corpus, ngram_range, type_)
        vocabulary = dict()

        id_ = 0

        for document in bag_of_words:
            for word in document:
                if word not in vocabulary:
                    vocabulary[word] = id_
                    id_ += 1

        sorted_vocabulary = sorted(vocabulary.items(), key=lambda x: x[1])
        feature_matrix = list()

        for document in bag_of_words:
            vector = list()
            for word in sorted_vocabulary:
                try:
                    vector.append(document[word[0]])
                except KeyError:
                    # If word is not present in bag of words, fill respective
                    # column with default value
                    if type_.startswith("binary"):
                        vector.append(False)
                    elif type_.startswith("frequency"):
                        vector.append(0)
                    elif type_.startswith("tfidf"):
                        vector.append(0.0)
            feature_matrix.append(vector)

        return vocabulary, feature_matrix

    def feature_matrix_sklearn(self,
                               corpus,
                               ngram_range=(1, 1),
                               binary=False,
                               type_=0):
        """
        Generate feature matrix for each document of a corpus.
        
        Args:
            corpus (list of str): Raw documents to be transformed into matrix of
                                  bags of words
            ngram_range (tuple of int, int): Start and end range for ngrams
            binary (bool): True if only indicator of presence of word in document
                           is needed, else False
            type_ (int): 0 - frequencies, 1 - tfidf
        
        Returns:
            list of dict of str/tuple of str:int pairs: Matrix of 
                word/ngram:frequency or tfidf measure of a word in text
        """
        if type_ == 0:
            count_vectorizer, feature_matrix = scikit_bag_of_words_frequencies(
                corpus, ngram_range, binary)
        elif type_ == 1:
            count_vectorizer, feature_matrix = scikit_bag_of_words_tfidf(
                corpus, ngram_range)
        dense_matrix = feature_matrix.toarray()
        return count_vectorizer.vocabulary_, dense_matrix, dense_matrix.tolist()
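A brief usage sketch for the class above (an illustration, not taken from the original repository), assuming the normalization and feature_extraction helpers it relies on are available on the path:

extractor = FeatureExtractor()
corpus = ["the cat sat on the mat", "the dog barked at the cat"]

# one binary bag of words per document
bags = extractor.bag_of_words(corpus, type_="binary")
print(bags)

# shared vocabulary plus one frequency vector per document
vocabulary, matrix = extractor.feature_matrix(corpus, type_="frequency")
print(sorted(vocabulary)[:5], matrix)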

if __name__ == "__main__":

    # datafile = "kddcup.data_10_percent.csv"
    # fields = [0, 4, 5, 22, 24, 25, 28, 31, 32, 35, 37, 38]
    # header = False
    # fieldClass = 41
    # k = 23
    # n = 20

    datafile = "kddcup.data_1000.csv"
    header = False
    fields = [0, 4, 5, 22, 24, 25, 28, 31, 32, 35, 37, 38]
    fieldClass = 41
    k = 17
    n = 20

    # datafile = "iris.csv"
    # fields = [0, 1, 2, 3]
    # fieldClass = 4
    # header = True
    # k = 3
    # n = 50

    norm = Normalizer(datafile, header)
    res = norm.run(fields, fieldClass)
    classes = norm.classes
    kMeanClusterer = KMeanClusterer(res, classes, k, n)
    print(json.dumps(kMeanClusterer.jsonify(), indent=2, separators=(',', ': ')))
                            encoding="utf-8",
                            object_pairs_hook=collections.OrderedDict)
    fin.close()
    abbrevs = abbrev_json["abbreviation-entries"].keys()

    # word tokenizer
    token_json_filepath = os.path.join(lang_path, "token.json")
    wordtok = WordTokenizer(token_json_filepath, abbrevs)

    # normalizer
    norm_json_filepath = os.path.join(lang_path, "norm.json")
    alphaexp_json_filepath = os.path.join(lang_path, "alphaexp.json")
    numexp_rule_filepath = os.path.join(lang_path, "numexp.rule")
    norm = Normalizer(norm_json_filepath, alphaexp_json_filepath,
                      numexp_rule_filepath,
                      abbrev_json["abbreviation-entries"])

    # sentence tokenizer
    sentence_json_filepath = os.path.join(lang_path, "sentence.json")
    senttok = SentenceTokenizer(sentence_json_filepath, raw_text_filepath)

    # ========================
    # run
    # ========================
    utts = []
    for sent in senttok.tokenize_iter():
        tokens, classes, puncs = wordtok.tokenize(sent)

        words = []
        for token, cls, punc in zip(tokens, classes, puncs):
from normalization import Normalizer
import nltk
from nltk import bigrams
#================= Loading the dataset and normalizing it ======================
Normalizer = Normalizer()  # note: this instance shadows the Normalizer class from here on


def loading_dataSet():
    with open("res/dataset.txt", "r") as f:
        data = f.read()
    docs = data.split("\n")
    types = []
    train = []
    for d in docs:
        d = d.split()
        if len(d) != 0:
            types.append(d[0])
    print('dataset Count = ' + str(len(types)))
    normalized_corpus = Normalizer.normalize_corpus(docs)
    normalized_corpus.remove('')
    for counter, x in enumerate(normalized_corpus):
        train.append((x, types[counter]))
    return train


normalized_dataset = loading_dataSet()
#===============================================================================
#========================= Starting training dataset ===========================
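A hedged sketch of the training step the banner above introduces (the original code is truncated here); the features() helper and the choice of nltk.NaiveBayesClassifier are assumptions, not the project's confirmed implementation:

def features(words):
    # hypothetical feature extractor: bag-of-words presence features
    return {w: True for w in words.split()}


featuresets = [(features(text), label) for text, label in normalized_dataset]
classifier = nltk.NaiveBayesClassifier.train(featuresets)
print('trained on ' + str(len(featuresets)) + ' documents')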
Example #16
class Test(unittest.TestCase):
    def setUp(self):
        self.datafile = "datasets/spambase_2.data"
        self.normalizer = Normalizer(self.datafile)

        pass

    def tearDown(self):
        pass

    def getDatasetSize(self, datafile):

        norm = Normalizer()
        iris_data_matrix = norm.load_csv(datafile)
        return len(iris_data_matrix)

    def testKMean(self):
        print("** test KMean **")

        # perform initialization
        k = 3
        n = 10
        cols = [3, 4, 5]

        kMeanClusterer = KMeanClusterer(k, n, cols, self.datafile)
        kMeanClusterer.performClustering()

        # total number of lines in the dataset
        dataLines = 0

        data_matrix = self.normalizer.get_csv()
        for row in data_matrix:
            if len(row) > 0:
                dataLines += 1

        # check that the number of observations from the dataset is kept
        totalObsNb = 0
        for clusterNb in range(kMeanClusterer.getClusterNumber()):
            cluster = kMeanClusterer.getCluster(clusterNb)
            totalObsNb += len(cluster.getObservations())

        self.assertTrue(
            dataLines == totalObsNb,
            "Number of entries in dataset: " + str(dataLines) +
            " is different from number of observations in cluster: " +
            str(totalObsNb))

        # check all normalized entries in the dataset are kept
        index = 0
        for entry in self.normalizer.normalize():
            found = False
            for clusterNb in range(kMeanClusterer.getClusterNumber()):
                cluster = kMeanClusterer.getCluster(clusterNb)
                observations = cluster.getObservations()
                for obs in observations:

                    if obs == entry:
                        found = True
                        break

            self.assertTrue(
                found, "observation " + str(entry) + " not found at index " +
                str(index))
            index += 1

    def testKMeanUpdate(self):
        print("** test KMean update **")

        k = 3
        n = 10
        cols = [3, 4, 5]

        datafile = "datasets/spambase_2.data"
        kMeanClusterer = KMeanClusterer(k, n, cols, datafile)

        kMeanClusterer.assignement()
        kMeanClusterer.update()

        # check existence of centroid
        for i in range(kMeanClusterer.getClusterNumber()):
            current_cluster = kMeanClusterer.getCluster(i)
            self.assertTrue(
                len(current_cluster.getCentroid()) > 0,
                "void centroid for cluster " + str(i))

        # check validity of centroid
        for i in range(kMeanClusterer.getClusterNumber()):
            current_cluster = kMeanClusterer.getCluster(i)
            current_centroid = current_cluster.getCentroid()
            obs = current_cluster.getObservations()
            for j in range(len(current_centroid)):
                tmp = 0
                for m in range(len(obs)):
                    try:
                        tmp += float(obs[m][j])
                    except ValueError:
                        pass  # field is not numeric
                try:
                    # check that this centroid field is numeric
                    value = float(current_centroid[j])
                    self.assertTrue(
                        tmp / len(obs) == value,
                        "current centroid: " + str(value) +
                        "; actual centroid value: " + str(tmp / len(obs)))
                except ValueError:
                    pass  # field is not numeric

    def testCentroidsComparison(self):
        print("** Test centroids comparison **")

        k = KMeanClusterer(3, 10, [3, 4, 5], "datasets/spambase_2.data")

        centroid1 = tuple([1, 2, 3, 4, 5])
        centroid2 = tuple([1, 2, 3, 4, 5])
        centroid3 = tuple([5, 4, 3, 2, 6])

        centroidsEquals1 = [centroid1, centroid1]
        centroidsEquals2 = [centroid2, centroid2]

        centroidsDifferents1 = [centroid1, centroid1]
        centroidsDifferents2 = [centroid1, centroid3]

        self.assertFalse(
            k.compareCentroids(centroidsEquals1, centroidsEquals2),
            "Centroids should be equal")

        self.assertTrue(
            k.compareCentroids(centroidsDifferents1, centroidsDifferents2),
            "Centroids should be different")

    def testCalculations(self):
        print("** Mean test **")

        arr = [10, 15, 20]
        moy = 15

        self.assertTrue(
            self.normalizer.moyenne(arr) == moy,
            "Mean calculation is incorrect")

    def testColumnExtraction(self):
        print("** Test column extraction **")

        multi = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 0]]

        single = [1, 3, 5, 7, 9]

        self.assertTrue(
            self.normalizer.column(multi, 0) == single,
            "Extracted column doesn't match")

    def testNormalization(self):
        print("** Test normalization **")

        the_normalizer = Normalizer("datasets/test_normalization.csv")

        normalized = [[0, 0, 0], [1, 1, 1], [0.5, 0.1, 0.9]]

        self.assertTrue(the_normalizer.normalize() == normalized,
                        "Normalized data doesn't match")

    def testCSVIntegrity(self):
        print("** Test CSV Integrity **")

        the_normalizer = Normalizer("datasets/test_normalization.csv")

        data = the_normalizer.get_csv()

        origin_data = [['0', '3', '0'], ['1', '33', '100'], ['0.5', '6', '90']]

        length = 3

        self.assertTrue(data == origin_data, "Data and CSV file don't match")
        self.assertTrue(length == the_normalizer.getRowLength(),
                        "Line length doesn't match")
Example #17
    def setUp(self):
        self.datafile = "datasets/spambase_2.data"
        self.normalizer = Normalizer(self.datafile)

        pass
        if i != c:
            cv2.drawContours(new, [cnts[i]], -1, color, thickness=cv2.FILLED)

    if all(all(p == 255 for p in line) for line in new):
        return None

    return new


# Parse arguments
ap = argparse.ArgumentParser()
ap.add_argument("-i", "--imgs_folder", required=True, help="Images folder")
args = vars(ap.parse_args())
imgs_folder = args['imgs_folder']

N = Normalizer(170)

for img in os.listdir(imgs_folder):

    image = cv2.imread("{}/{}".format(imgs_folder, img), 0)

    display("original", image)

    thresh = cv2.threshold(image, 60, 255, cv2.THRESH_BINARY)[1]

    _, cnts, h = cv2.findContours(thresh.copy(), cv2.RETR_TREE,
                                  cv2.CHAIN_APPROX_SIMPLE)

    # Hierarchy: For each contour -> [next, previous, child, parent]
    n = h[0][0][2]  # first child
    c = []  # c -> external contours [contour, area, id]