Example #1
def kd_tree_classification(k, lenData, pctTest, params, neightboards):

    clear_csv()

    samples = []

    if (params[0] == "PAIS"):
        samples = generar_muestra_pais(lenData)
    else:
        samples = generar_muestra_provincia(lenData, params[1])
    quantity_for_testing = int(lenData * pctTest)

    normalizer = Normalizer()
    data = normalizer.prepare_data(samples, quantity_for_testing)

    kdTree = Kd_Tree(neightboards)
    firstRound = cross_validation(k, kdTree, data, lenData, "trainingFeatures",
                                  "testingFeatures", "First")

    secondRound = cross_validation(k, kdTree, data, lenData,
                                   "trainingFeatures", "testingFeatures",
                                   "Second")

    secondWithFirst = cross_validation(k, kdTree, data, lenData,
                                       "trainingFeaturesFirstInclude",
                                       "testingFeaturesFirstInclude", "Second")

    normalData = normalizer.get_normal_data()
    predictions = [firstRound, secondRound, secondWithFirst]

    show_accuracy("KD-TREE", predictions)
    make_csv(k, normalData, lenData, pctTest, predictions)
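All of the classification helpers in these examples delegate to a cross_validation(k, classifier, data, lenData, trainKey, testKey, round) function that is not shown on this page. As a rough, simplified sketch of what such a helper could look like (it drops the lenData and testing-set arguments, assumes the classifier exposes train()/classify() as used near the end of this page, assumes classify() returns predicted class labels, and assumes the labels live under a "trainingClasses" + round key):

import numpy as np

def cross_validation_sketch(k, classifier, data, features_key, round_name):
    # Illustrative k-fold loop, not the project's actual helper.
    features = np.asarray(data[features_key])
    classes = np.asarray(data["trainingClasses" + round_name])
    accuracies = []
    for fold in np.array_split(np.arange(len(features)), k):
        held_out = np.zeros(len(features), dtype=bool)
        held_out[fold] = True
        classifier.train({
            "trainingFeatures": features[~held_out],
            "trainingClasses": classes[~held_out],
            "testingFeatures": features[held_out],
            "testingClasses": classes[held_out],
        })
        predicted = np.asarray(classifier.classify({
            "testingFeatures": features[held_out],
            "testingClasses": classes[held_out],
        }))
        accuracies.append(np.mean(predicted == classes[held_out]))
    return float(np.mean(accuracies))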
 def verify_address(adress, results, borough):
     zips = Normalizer.select_zipcode_class(Normalizer.get_neighborhood(borough))
     for r in results:
         zip3dig = int(r[2]) // 100  # keep only the three-digit ZIP prefix
         if zip3dig in zips:
             return r[0], r[1], adress+", "+r[2]
     return None
Example #3
def desicion_tree(k, lenData, pctTest, params, threshold):

    clear_csv()

    samples = []

    if (params[0] == "PAIS"):
        samples = generar_muestra_pais(lenData)
    else:
        samples = generar_muestra_provincia(lenData, params[1])
    quantity_for_testing = int(lenData * pctTest)
    normalizer = Normalizer()
    data = normalizer.separate_data_2(samples, quantity_for_testing)

    decisionTree = DecisionTree(threshold)
    firstRound = cross_validation(k, decisionTree, data, lenData,
                                  "trainingFeaturesFirst",
                                  "testingFeaturesFirst", "First")

    secondRound = cross_validation(k, decisionTree, data, lenData,
                                   "trainingFeaturesSecond",
                                   "testingFeaturesSecond", "Second")

    secondWithFirst = cross_validation(k, decisionTree, data, lenData,
                                       "trainingFeaturesFirstInclude",
                                       "testingFeaturesFirstInclude", "Second")

    normalData = normalizer.get_normal_data()
    predictions = [firstRound, secondRound, secondWithFirst]

    show_accuracy("DT", predictions)
    make_csv(k, normalData, lenData, pctTest, predictions)
Example #4
def lr_classification(k, lenData, pctTest, l_regulizer=1):

    clear_csv()

    samples = generar_muestra_pais(lenData)
    quantity_for_testing = int(lenData * pctTest)

    normalizer = Normalizer()
    data = normalizer.prepare_data_tensor(samples, quantity_for_testing)

    lrClassifier = LogisticRegression(1, l_regulizer)

    firstRound = cross_validation(k, lrClassifier, data, lenData,
                                  "trainingFeatures", "testingFeatures",
                                  "First")

    lrClassifier = LogisticRegression(2, l_regulizer)
    print("Paso primero")

    secondRound = cross_validation(k, lrClassifier, data, lenData,
                                   "trainingFeatures", "testingFeatures",
                                   "Second")
    print("Paso segundo")

    secondWithFirst = cross_validation(k, lrClassifier, data, lenData,
                                       "trainingFeaturesFirstInclude",
                                       "testingFeaturesFirstInclude", "Second")
    print("Paso tercero")

    normalData = normalizer.get_normal_data()
    # predictions = [firstRound, secondRound, secondWithFirst]
    predictions = [secondRound]
    show_accuracy("LR", predictions)
    make_csv(k, normalData, lenData, pctTest, predictions)
Example #5
class Predicter(object):

    def __init__(self, thetafile='theta.csv', datafile='data.csv'):
        try:
            df = pd.read_csv(thetafile)
            self.theta = [
                float(df['theta0'].iloc[0]),
                float(df['theta1'].iloc[0])
            ]
        except Exception:
            print("the theta file was not found")
            self.theta = None
        self.norm = Normalizer(datafile)

    def predict(self, km):
        """
        Model function:
            f(x) = ax + b
        """
        return self.theta[0] + (self.theta[1] * km)

    def run(self, km):
        if self.theta is None:
            return None
        norm_km = self.norm.normalize_km(km)
        result = self.predict(norm_km)
        return self.norm.denormalize_price(result)
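A minimal usage sketch for Predicter, assuming a theta.csv with single-row theta0/theta1 columns and a data.csv with the km/price columns the Normalizer expects (both layouts are inferred from these snippets, not confirmed):

predicter = Predicter(thetafile='theta.csv', datafile='data.csv')
price = predicter.run(150000)          # mileage to estimate a price for
if price is not None:                  # None means the theta file was missing
    print("estimated price:", price)   # f(x) = theta[0] + theta[1] * x, denormalized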
Example #6
def train_data_set():
    normalizer = Normalizer()
    data_set = []
    labels = []
    for i in range(0, 256, 8):
        n = normalizer.norm(int(random.uniform(0, 256)))
        data_set.append(n)
        labels.append(n)
    return labels, data_set
def svm_classification(
        k, lenData, pctTest, params, C=1, gamma=1, kernel="rbf"):

    clear_csv()

    samples = []

    print(params)
    if (params[0] == "PAIS"):
        samples = generar_muestra_pais(lenData)
    else:
        samples = generar_muestra_provincia(lenData, params[1])

    quantity_for_testing = int(lenData * pctTest)

    normalizer = Normalizer()
    data = normalizer.prepare_data(samples, quantity_for_testing)

    svmClassifier = SVMClassifier(kernel, C, gamma)

    firstRound = cross_validation(
        k,
        svmClassifier,
        data,
        lenData,
        "trainingFeatures",
        "testingFeatures",
        "First"
    )

    secondRound = cross_validation(
        k,
        svmClassifier,
        data,
        lenData,
        "trainingFeatures",
        "testingFeatures",
        "Second"
    )

    secondWithFirst = cross_validation(
        k,
        svmClassifier,
        data,
        lenData,
        "trainingFeaturesFirstInclude",
        "testingFeaturesFirstInclude",
        "Second"
    )

    normalData = normalizer.get_normal_data()
    predictions = [firstRound, secondRound, secondWithFirst]

    show_accuracy("SVM", predictions)
    make_csv(k, normalData, lenData, pctTest, predictions)
 def preprocess(self):
     tuples = CsvManager.read(self.input)
     num = CsvManager.read_progress()
     print(num)
     if num == 0:
         CsvManager.write_geo_codes([], self.output)
         CsvManager.write_progress('0')
     self.progress.set_size(len(tuples))
     self.progress.update_progress(num)
     Normalizer.set_tuple(num, tuples)
     return tuples
Example #9
 def __init__(self, thetafile='theta.csv', datafile='data.csv'):
     try:
         df = pd.read_csv(thetafile)
         self.theta = [
             float(df['theta0'].iloc[0]),
             float(df['theta1'].iloc[0])
         ]
     except Exception:
         print("the theta file was not found")
         self.theta = None
     self.norm = Normalizer(datafile)
Example #10
 def __init__(self, datafile, outfile, theta=[0, 0], learning_rate=0.1, train_range=1000):
     self.t_history = []
     self.theta = theta
     self.datafile = datafile
     self.output = outfile
     self.data = pd.read_csv(datafile)
     self.learning_rate = learning_rate
     self.range = train_range
     norm = Normalizer(self.datafile)
     self.km = norm.normalize_km_list(self.data['km'])
     self.price = norm.normalize_price_list(self.data['price'])
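Example #10 only shows the trainer's constructor; the update loop itself is not included here. For context, one plain batch gradient-descent step for the model f(x) = theta[0] + theta[1] * x over the normalized km/price pairs could look like the sketch below (illustrative only; the project's real loop may differ):

import numpy as np

def train_step_sketch(theta, km, price, learning_rate):
    # One batch gradient-descent step for f(x) = theta[0] + theta[1] * x.
    km = np.asarray(km, dtype=float)
    price = np.asarray(price, dtype=float)
    error = (theta[0] + theta[1] * km) - price
    grad0 = error.mean()            # mean residual          -> gradient w.r.t. theta[0]
    grad1 = (error * km).mean()     # residual weighted by x -> gradient w.r.t. theta[1]
    return [theta[0] - learning_rate * grad0,
            theta[1] - learning_rate * grad1]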
Example #11
def t(network, data):
    '''
    test the model with input data
    :param network: 
    :param data: 
    :return: 
    '''
    normalizer = Normalizer()
    norm_data = normalizer.norm(data)
    predict_data = network.predict(norm_data)
    print('\ttestdata(%u)\tpredict(%u)' % (data,
                                           normalizer.denorm(predict_data)))
def lr_classification(k, lenData, pctTest, l_regulizer=1):

    clear_csv()

    samples = generar_muestra_pais(lenData)
    quantity_for_testing = int(lenData * pctTest)

    normalizer = Normalizer()
    data = normalizer.prepare_data_tensor(samples, quantity_for_testing)

    lrClassifier = LogisticRegression(1, l_regulizer)

    firstRound = cross_validation(
        k,
        lrClassifier,
        data,
        lenData,
        "trainingFeatures",
        "testingFeatures",
        "First"
    )

    lrClassifier = LogisticRegression(2, l_regulizer)
    print("Paso primero")

    secondRound = cross_validation(
        k,
        lrClassifier,
        data,
        lenData,
        "trainingFeatures",
        "testingFeatures",
        "Second"
    )
    print("Paso segundo")

    secondWithFirst = cross_validation(
        k,
        lrClassifier,
        data,
        lenData,
        "trainingFeaturesFirstInclude",
        "testingFeaturesFirstInclude",
        "Second"
    )
    print("Paso tercero")

    normalData = normalizer.get_normal_data()
    # predictions = [firstRound, secondRound, secondWithFirst]
    predictions = [secondRound]
    show_accuracy("LR", predictions)
    make_csv(k, normalData, lenData, pctTest, predictions)
def kd_tree_classification(k, lenData, pctTest, params, neightboards):

    clear_csv()

    samples = []

    if (params[0] == "PAIS"):
        samples = generar_muestra_pais(lenData)
    else:
        samples = generar_muestra_provincia(lenData, params[1])
    quantity_for_testing = int(lenData * pctTest)

    normalizer = Normalizer()
    data = normalizer.prepare_data(samples, quantity_for_testing)

    kdTree = Kd_Tree(neightboards)
    firstRound = cross_validation(
        k,
        kdTree,
        data,
        lenData,
        "trainingFeatures",
        "testingFeatures",
        "First"
    )

    secondRound = cross_validation(
        k,
        kdTree,
        data,
        lenData,
        "trainingFeatures",
        "testingFeatures",
        "Second"
    )

    secondWithFirst = cross_validation(
        k,
        kdTree,
        data,
        lenData,
        "trainingFeaturesFirstInclude",
        "testingFeaturesFirstInclude",
        "Second"
    )

    normalData = normalizer.get_normal_data()
    predictions = [firstRound, secondRound, secondWithFirst]

    show_accuracy("KD-TREE", predictions)
    make_csv(k, normalData, lenData, pctTest, predictions)
def desicion_tree(k, lenData, pctTest, params, threshold):

    clear_csv()

    samples = []

    if (params[0] == "PAIS"):
        samples = generar_muestra_pais(lenData)
    else:
        samples = generar_muestra_provincia(lenData, params[1])
    quantity_for_testing = int(lenData * pctTest)
    normalizer = Normalizer()
    data = normalizer.separate_data_2(samples, quantity_for_testing)

    decisionTree = DecisionTree(threshold)
    firstRound = cross_validation(
        k,
        decisionTree,
        data,
        lenData,
        "trainingFeaturesFirst",
        "testingFeaturesFirst",
        "First"
    )

    secondRound = cross_validation(
        k,
        decisionTree,
        data,
        lenData,
        "trainingFeaturesSecond",
        "testingFeaturesSecond",
        "Second"
    )

    secondWithFirst = cross_validation(
        k,
        decisionTree,
        data,
        lenData,
        "trainingFeaturesFirstInclude",
        "testingFeaturesFirstInclude",
        "Second"
    )

    normalData = normalizer.get_normal_data()
    predictions = [firstRound, secondRound, secondWithFirst]

    show_accuracy("DT", predictions)
    make_csv(k, normalData, lenData, pctTest, predictions)
Example #15
    def __init__(self, loss_function):
        """
        Parameters          
        ----------
        loss_function : SquaredErrorLoss
            loss function for the MLP
        """

        self.layers = []
        self.activations_functions = []
        self.loss_function = loss_function
        self.normalizer = Normalizer()
        # All errors, returned in end for plotting
        self.errors = []
        self.predict_errors = 0
def writeEntryPointProfiles(outEntryPointHdl, entryPointName, agentTypes):
        epNumber = entryPointName.split("_")[0]
        outEntryPointHdl.write(indent1 + "\n<entrypoint ID=\"ep_" + epNumber + "\" geotype=\"point\" type=\"dynamic\">\n")
        outEntryPointHdl.write(2 * indent2 + "<loader>entry_loader</loader>\n")
        outEntryPointHdl.write(2 * indent2 + "<geometry_id>" + epNumber + "</geometry_id>\n")
        outEntryPointHdl.write(2 * indent2 + "<profiles>\n")
        for agent in agentTypes:
            n = Normalizer()
            agent = n.normalize(agent);
            outEntryPointHdl.write(3 * indent2 + "<profile>\n")
            outEntryPointHdl.write(4 * indent2 + "<agentDistribution pctofentries=\"100.0\" agent_type=\"agent_" + agent + "\"/>\n")
            outEntryPointHdl.write(4 * indent2 + "<timetableReference ref=\"tt_" + agent + "\" scaleFactor=\"1.0\"/>\n")
            outEntryPointHdl.write(3 * indent2 + "</profile>\n")
        outEntryPointHdl.write(2 * indent2 + "</profiles>\n")
        outEntryPointHdl.write(indent1 + "</entrypoint>\n")
Example #17
def pruebas():
    # svm_classification(1000, 0.2, C=10, gamma=0.00833333333, kernel="rbf")
    lenData = 2500
    print(lenData)
    print("kernel: ", "sigmoid", " C: ", 1, " G: ", 0.000000001)
    pctTest = 0.2

    # samples = generar_muestra_provincia(lenData, "SAN JOSE")
    # quantity_for_testing = int(lenData*pctTest)

    # normalizer = Normalizer()
    # data = normalizer.prepare_data(samples, quantity_for_testing)

    # svm_classification(10, lenData, pctTest, C=1, gamma=1, kernel="rbf")

    time1 = time.time()

    for i in range(0, 30):
        samples = generar_muestra_pais(lenData)
        quantity_for_testing = int(lenData * pctTest)

        normalizer = Normalizer()
        data = normalizer.prepare_data(samples, quantity_for_testing)
        svm_classification(10,
                           lenData,
                           pctTest,
                           C=1,
                           gamma=0.000000001,
                           kernel="sigmoid")

    time2 = time.time()

    print("ms: ", ((time2 - time1) * 1000.0))

    totalacc = 0.0
    for i in range(0, len(accList), 3):
        totalacc += accList[i][1]
    print("ER: ", 1 - (totalacc / 30.0))

    totalacc = 0.0
    for i in range(1, len(accList), 3):
        totalacc += accList[i][1]
    print("ER: ", 1 - (totalacc / 30.0))

    totalacc = 0.0
    for i in range(2, len(accList), 3):
        totalacc += accList[i][1]
    print("ER: ", 1 - (totalacc / 30.0))
Example #18
def main():
    test = [ 61.19499969,  57.31000137,  56.09249878,  61.72000122,
            61.38000107,  64.61000061,  61.93500137,  63.70249939,
            63.57249832,  60.22750092,  61.23249817,  60.35250092,
            65.61750031,  64.85749817,  66.51750183,  66.99749756,
            68.3125    ,  71.76249695,  71.10749817,  71.67250061,
            70.69999695,  69.23249817,  67.09249878,  69.02500153,
            68.75749969,  70.74250031,  70.79250336,  69.64499664,
            71.93250275,  73.44999695,  72.26750183,  73.29000092,
            74.38999939,  75.15750122,  75.93499756,  77.53250122,
            78.75250244,  77.85250092,  76.91249847,  77.38500214,
            76.92749786,  78.73999786,  78.28500366,  79.80750275,
            79.21250153,  79.72250366,  79.18250275,  79.52749634,
            79.5625    ,  79.48500061,  80.46250153,  80.83499908,
            81.27999878,  80.58000183,  82.875     ,  83.36499786,
            85.99749756,  88.20999908,  83.97499847,  84.69999695,
            85.74749756,  88.01999664,  87.89749908,  87.93250275,
            87.43000031,  89.71749878,  91.63249969,  90.01499939,
            91.20999908,  88.40750122,  90.44499969,  91.19999695,
            91.02749634,  91.02749634,  93.46250153,  93.17250061,
            95.34249878,  95.75250244,  95.91999817,  95.47750092,
            97.05750275,  97.72499847,  96.52249908,  96.32749939,
            98.35749817,  97.        ,  97.27249908,  92.84500122,
            92.61499786,  94.80999756,  93.25250244,  95.04000092,
            96.19000244, 106.26000214, 108.9375    , 109.66500092,
            110.0625    , 113.90249634, 111.11250305, 112.72750092]
    
    test = np.array(test)
    test = np.reshape(test, (-1, 1))
    normalizer = Normalizer()
    normalized = normalizer.FeatureScaler.transform(test)
    print(normalized)    
Example #19
    def _do_work(self,queue,bbnp,param,action_space,state_space,max_length,seed_id,num_workers,env,continuous_action,cma_param,index_start):
        start = time.time()
        np.random.seed(seed_id)
        
        if self.noise_type == 'Gaussian':
            noises = self.sigma * np.random.randn(num_workers,len(param))
            noisy_param = param + noises
        elif self.noise_type == 'Hadamard':
            h_size = 1<<((max(num_workers,len(param))-1).bit_length())
            h = hadamard(h_size)
            noises = self.sigma*(h@np.diag(np.random.choice([-1,1], h_size)))[:num_workers,:len(param)]
            noisy_param = param + noises
        elif self.noise_type == 'CMA':
            noisy_param = cma_param
        elif self.noise_type == 'CDPP':
            noisy_param = self.buffer[index_start:index_start+num_workers]
        
        fitness = []
        anti_fitness = []
        worker_summary = {}

        for ind in noisy_param:

            #do the roll out
            if self.state_renormalize == True:
                normal = Normalizer(state_space)
                ind_fit = bbnp.roll_out(ind,env,self.env,normal,render = False,state_renormalize = True)
                normal = Normalizer(state_space)
                ind_fit_anti = bbnp.roll_out(-ind,env,self.env,normal,render = False,state_renormalize = True)
            else:
                normal = Normalizer(state_space)
                ind_fit = bbnp.roll_out(ind,env,self.env,normal,render = False,init = False)
                ind_fit_anti = bbnp.roll_out(-ind,env,self.env,normal,render = False,init = False)


            fitness.append(ind_fit)
            anti_fitness.append(ind_fit_anti)

        end = time.time()

        worker_summary['fit'] = fitness
        worker_summary['anti_fit'] = anti_fitness
        worker_summary['seed_id'] = seed_id
        queue.put(worker_summary)
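The 'Hadamard' branch above builds structured perturbations instead of i.i.d. Gaussian noise: take the smallest power-of-two Hadamard matrix covering both the worker count and the parameter dimension, flip the sign of each column at random, scale by sigma, and crop to the block that is needed. A standalone sketch of just that construction (assuming hadamard comes from scipy.linalg, as the call above suggests):

import numpy as np
from scipy.linalg import hadamard

def hadamard_noise(num_workers, dim, sigma, rng=np.random):
    # Smallest power of two that is >= max(num_workers, dim).
    h_size = 1 << (max(num_workers, dim) - 1).bit_length()
    h = hadamard(h_size)
    signs = np.diag(rng.choice([-1, 1], h_size))     # random per-column sign flips
    return sigma * (h @ signs)[:num_workers, :dim]   # crop to the block actually used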
def pruebas():
    # svm_classification(1000, 0.2, C=10, gamma=0.00833333333, kernel="rbf")
    lenData = 2500
    print(lenData)
    print("kernel: ", "sigmoid", " C: ", 1, " G: ", 0.000000001)
    pctTest = 0.2

    # samples = generar_muestra_provincia(lenData, "SAN JOSE")
    # quantity_for_testing = int(lenData*pctTest)

    # normalizer = Normalizer()
    # data = normalizer.prepare_data(samples, quantity_for_testing)

    # svm_classification(10, lenData, pctTest, C=1, gamma=1, kernel="rbf")

    time1 = time.time()

    for i in range(0, 30):
        samples = generar_muestra_pais(lenData)
        quantity_for_testing = int(lenData*pctTest)

        normalizer = Normalizer()
        data = normalizer.prepare_data(samples, quantity_for_testing)
        svm_classification(
            10, lenData, pctTest, C=1, gamma=0.000000001, kernel="sigmoid")

    time2 = time.time()

    print("ms: ", ((time2-time1)*1000.0))

    totalacc = 0.0
    for i in range(0, len(accList), 3):
        totalacc += accList[i][1]
    print("ER: ", 1-(totalacc/30.0))

    totalacc = 0.0
    for i in range(1, len(accList), 3):
        totalacc += accList[i][1]
    print("ER: ", 1-(totalacc/30.0))

    totalacc = 0.0
    for i in range(2, len(accList), 3):
        totalacc += accList[i][1]
    print("ER: ", 1-(totalacc/30.0))
Example #21
def svm_classification(k,
                       lenData,
                       pctTest,
                       params,
                       C=1,
                       gamma=1,
                       kernel="rbf"):

    clear_csv()

    samples = []

    print(params)
    if (params[0] == "PAIS"):
        samples = generar_muestra_pais(lenData)
    else:
        samples = generar_muestra_provincia(lenData, params[1])

    quantity_for_testing = int(lenData * pctTest)

    normalizer = Normalizer()
    data = normalizer.prepare_data(samples, quantity_for_testing)

    svmClassifier = SVMClassifier(kernel, C, gamma)

    firstRound = cross_validation(k, svmClassifier, data, lenData,
                                  "trainingFeatures", "testingFeatures",
                                  "First")

    secondRound = cross_validation(k, svmClassifier, data, lenData,
                                   "trainingFeatures", "testingFeatures",
                                   "Second")

    secondWithFirst = cross_validation(k, svmClassifier, data, lenData,
                                       "trainingFeaturesFirstInclude",
                                       "testingFeaturesFirstInclude", "Second")

    normalData = normalizer.get_normal_data()
    predictions = [firstRound, secondRound, secondWithFirst]

    show_accuracy("SVM", predictions)
    make_csv(k, normalData, lenData, pctTest, predictions)
 def fix_acris(self):
     tuples = self.preprocess()
     real_estates = []
     while tuples:
         try:
             t = tuples.pop(0)
             bbl = Normalizer.set_bbl(t[0], t[1], t[2])
             address = t[3]+" "+t[4]
             address = Normalizer.set_address(address, bbl)
             date = Normalizer.set_str_to_epoch(t[5])
             price = t[6]
             real_estates.append((bbl, address, date, price))
         except ValueError:
             self.error_log.open()
             self.error_log.write(t[1]+", "+str(t[0]))
             self.error_log.close()
         except KeyboardInterrupt:
             print ""
             print "Stopped"
             CsvManager.append_geo_codes(real_estates, self.output)
     CsvManager.append_geo_codes(real_estates, self.output)
Example #23
	def testLogProbabilities(self):
		probs = numpy.asarray([ [0.01, 0.04],
								[0.1, 0.4],
								[0.4, 0.1],
								[0.00004, 0.00001]])
		
		expectedProbs =  numpy.asarray([ [0.2, 0.8],
								[0.2, 0.8],
								[0.8, 0.2],
								[0.8, 0.2]])
								
		logProbs = numpy.log(probs)
		logExpectedProbs = numpy.log(expectedProbs)
		
		target = Normalizer()
		
		normalizedLogProbs = target.normalizeLogProbabilities(logProbs)
		normalizedProbs = numpy.exp(normalizedLogProbs)
		
		numpy.testing.assert_array_almost_equal(normalizedProbs, expectedProbs)
		numpy.testing.assert_array_almost_equal(normalizedLogProbs, logExpectedProbs)
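The expected values in this test follow from row-wise normalization in log space: each row of log-probabilities is shifted by its log-sum-exp, so the exponentiated row sums to 1 (for the first row, 0.01 / (0.01 + 0.04) = 0.2). A minimal sketch of how such a normalizeLogProbabilities could be implemented, assuming the row-wise convention the test exercises:

import numpy

def normalize_log_probabilities(log_probs):
    # Row-wise log-sum-exp normalization: exp(result) sums to 1 in every row.
    log_probs = numpy.asarray(log_probs, dtype=float)
    row_max = log_probs.max(axis=1, keepdims=True)
    log_norm = row_max + numpy.log(
        numpy.exp(log_probs - row_max).sum(axis=1, keepdims=True))
    return log_probs - log_norm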
Example #24
 def normalizeTokens(self):
     '''Normalizes each token in the instance and stores the normalized tokens in the token dictionary'''
     normalizedTokenDictionary = {}
     normalizedTokenId = 1
     for tokenObject in self.tokenDictionary.values():
         normalizedTokenObjects = Normalizer.getInstance().normalizeToken(
             tokenObject)
         for normalizedTokenObject in normalizedTokenObjects:
             normalizedTokenDictionary[
                 normalizedTokenId] = normalizedTokenObject
             normalizedTokenId += 1
     return normalizedTokenDictionary
Example #25
    def _get_block_dict(self):
        self._block_dict = {}

        for section in self._config.sections():
            self._block_dict[section] = []
            for option in self._config[section]:
                if option not in [
                        'url_base', 'query_interval', 'tmp', 'output',
                        'iterations'
                ]:
                    normalized_option = Normalizer.normalize_name(option)
                    output_xml_file = self._output + '/' + section + '/' + normalized_option + ".xml"
                    self._block_dict[section].append((option, output_xml_file))
Example #26
    def _process_article(self, doc_key, article):
        normalizer = Normalizer()
        all_terms = []
        for i in article.findall('*'):
            if 'encoded' in i.tag or 'description' in i.tag:
                if i.text:
                    i = BeautifulSoup(i.text, "lxml")

            if i.text is not None:
                for y in i.text.split():
                    all_terms.append(y)

        cleaned_terms = []
        for i in all_terms:
            normalized = normalizer.normalize_name(i).strip()
            if (len(normalized) >= 3 and not normalizer.is_stop_word(normalized)
                    and not normalizer.is_link(normalized)):
                cleaned_terms.append(normalized)

        for term in cleaned_terms:
            stemmed_term = self._stemmer.stemWord(term)
            term_id = self._get_or_create_term_id(stemmed_term)
            self._ii_dict.setdefault(term_id, set())
            self._ii_dict[term_id].add(self._document_dict.get(doc_key))
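_process_article builds a classic inverted index: _ii_dict maps a term id to the set of document ids that contain the term. A hedged sketch of how such an index might be queried afterwards (the helper name is illustrative, not part of the original class):

def documents_matching_all(ii_dict, term_ids):
    # Intersect the posting sets of every term id; an empty query matches nothing.
    postings = [ii_dict.get(term_id, set()) for term_id in term_ids]
    return set.intersection(*postings) if postings else set()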
Example #27
    def __init__(self):

        self.NaiveBayesClassifier = NaiveBayesClassifier()

        # Sentence Splitters
        self.RuleBasedSentenceSplitter = RuleBasedSentenceSplitter()
        self.MLBasedSentenceSplitter = MLBasedSentenceSplitter()

        # Tokenizers
        self.RuleBasedTokenizer = RuleBasedTokenizer()
        self.MLBasedTokenizer = MLBasedTokenizer()

        # Normalizer
        self.Normalizer = Normalizer()

        # Stemmer
        self.Stemmer = Stemmer()

        # Stopword Eliminators
        self.StaticStopWordEliminator = StaticStopwordRemover()
        self.DynamicStopWordEliminator = DynamicStopWordEliminator()
 def search_lat_long(self):
     tuples = self.preprocess()
     count = 1
     nominatim, google, opencage, bing, tiger = self.build_geocodings()
     while tuples:
         t = tuples.pop(0)
         status, found = self.geocode_process(t, nominatim)
         if not found:
             if status == -1:
                 status, found = self.geocode_process(t, bing)
                 if not found and status == -1:
                     self.geocode_process(t, tiger)
             elif status == -2:
                 i = 1
                 while i < 3:
                     print "Waiting 45' for the "+Normalizer.set_order(str(i))+" time"
                     time.sleep(2700)
                     status, found = self.geocode_process(t, nominatim)
                     if found:
                         continue
                     elif status == -2:
                         i += 1
                     elif status == -3:
                         return
             if count % 100 == 0:
                 for i in range(3):
                     t = tuples.pop(0)
                     status, found = self.geocode_process(t, google)
                     time.sleep(3)
                     if not found:
                         self.geocode_process(t, opencage)
                         time.sleep(3)
                     else:
                         t = tuples.pop(0)
                         self.geocode_process(t, opencage)
                         time.sleep(3)
         count += 1
 def _apply(self):
     '''
     Apply transformations set ...
     '''
     if 'filter' not in self.config:
         raise ValueError('Config map should include "filter."')
     opt = DataParserOpt(**self.config['filter'])
     dr = DataReader(self.map_file, self.data, opt=opt)
     A, y = dr.extract_features()
     if 'sample' in self.config:
         try:
             method = self.config['sample']['method']
             method_args = self.config['sample']['method_args']
             ds = DataSampler(A, y, method, **method_args)
         except Exception as e:
             ds = DataSampler(A, y)
             print('{} or\nSample structure has no method_args'.format(e))
         A, y = ds.sample()
     if 'normalize' in self.config:
         nml = Normalizer(map_file=self.map_file)
         for method_str in self.config['normalize']['methods']:
             method = getattr(nml, method_str)
             A = method(A)
     return A, y
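_apply is driven entirely by self.config: a required "filter" block passed to DataParserOpt, an optional "sample" block naming a DataSampler method plus its arguments, and an optional "normalize" block listing Normalizer method names applied in order. A plausible config map could look like the following (all concrete keys and values here are illustrative assumptions):

config = {
    "filter": {"min_rows": 10},                      # forwarded to DataParserOpt(**...)
    "sample": {"method": "undersample",              # forwarded to DataSampler
               "method_args": {"ratio": 1.0}},
    "normalize": {"methods": ["scale", "center"]},   # Normalizer methods, applied in order
}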
Example #30
 def test_distance(self):
     normalizer = Normalizer()
     self.assertEqual(distance([0, 1, 2], [3, 4, 5]), 5.196152422706632)
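The assertion above pins distance down as plain Euclidean distance: sqrt(3^2 + 3^2 + 3^2) = sqrt(27) = 5.196152422706632 (the Normalizer instance is created but not used by the check). A sketch that satisfies this test:

import math

def distance(a, b):
    # Euclidean distance between two equal-length sequences.
    return math.sqrt(sum((x - y) ** 2 for x, y in zip(a, b)))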
                print("Accuracy Training:",
                      accuracy.eval({
                          X: train_x,
                          y: train_y
                      }))


##        sess = tf.Session()
##        with sess.as_default():
##            return self.toparty(self.y.eval({self.X: test_x, self.y: test_y}).tolist())
##

samples = generar_muestra_pais(100)
quantity_for_testing = int(100 * 0.2)
normalizer = Normalizer()
data = normalizer.prepare_data(samples, quantity_for_testing)
classes = np.append(data["trainingClassesFirst"],
                    data["testingClassesFirst"],
                    axis=0)
sample = {
    "trainingFeatures": data["trainingFeatures"],
    "trainingClasses": data["trainingClassesFirst"],
    "testingFeatures": data["testingFeatures"],
    "testingClasses": data["testingClassesFirst"]
}
sample2 = {
    "testingFeatures": data["testingFeatures"],
    "testingClasses": data["testingClassesFirst"]
}
print(sample2["testingClasses"])
                    # empieza el entrenamiento
                    _, c = sess.run([optimizer, cost], feed_dict={self.X: train_x,
                                                                  self.y: train_y})
                    cost_in_each_epoch += c
    ##                # you can uncomment next two lines of code for printing cost when training
    ##                if (epoch+1) % display_step == 0:
    ##                    print("Epoch: {}".format(epoch + 1), "cost={}".format(cost_in_each_epoch))

                print("Accuracy Training:", accuracy.eval({X: train_x, y: train_y}))
##        sess = tf.Session()
##        with sess.as_default():
##            return self.toparty(self.y.eval({self.X: test_x, self.y: test_y}).tolist())
##            


samples = generar_muestra_pais(100)
quantity_for_testing = int(100*0.2)
normalizer = Normalizer()
data = normalizer.prepare_data(samples, quantity_for_testing)
classes = np.append(
        data["trainingClassesFirst"],
        data["testingClassesFirst"],
        axis=0
    )
sample = { "trainingFeatures": data["trainingFeatures"], "trainingClasses": data["trainingClassesFirst"],"testingFeatures": data["testingFeatures"], "testingClasses": data["testingClassesFirst"]}
sample2 = { "testingFeatures": data["testingFeatures"], "testingClasses": data["testingClassesFirst"]}
print(sample2["testingClasses"])
prueba = logistic_regression_classifier(1,classes)
prueba.train(sample)
print(prueba.classify(sample2))
Example #33
    def train(self):

        #need to put into function
        
        if not os.path.exists("./" + param_dir):
            os.mkdir(param_dir)

        for test in range(self.num_test):
            fit_list = []
            iteration_list = []

            plt.ion()
            plt.show()

            #set up environment
            env = gym.make(self.env)


            if self.continuous_action == True:
                action_space = env.action_space.shape[0]
            else:
                action_space = env.action_space.n
            state_space = env.observation_space.low.size

            param_dict = {}
            #get param for the len
            best_reward = 0


            bbnp = bb_numpy(action_space,state_space,self.max_length,continuous_action = self.continuous_action,state_renormalize = self.state_renormalize)
            state = np.zeros(state_space)[None,...]
            bbnp.forward_propagate(state,init = True)

            #initialize param
            param = np.array(bbnp.get_flat_param())
            SGD_ = SGD(param, self.lr)
            
            cma_es = cma.CMAEvolutionStrategy(param,self.sigma,{'popsize': self.num_perturbations,})



            for iteration in range(self.iterations):
                
                ts = time.time()                
                cma_param = np.array(cma_es.ask())


                if self.method_type == 'CDPP':
                    if iteration == 0:

                        X = np.random.randn(self.num_perturbations*3,len(param))
                        cond_indices = cond_kdpp([], X, k = self.num_perturbations)
                        self.buffer = X[cond_indices]

                    else:
                        dists = np.linalg.norm(self.buffer-param,axis=1)
                        num_closest= int(self.perc_reuse*self.num_perturbations)

                        closest_indices = dists.argsort()[:num_closest]
                        reused_samples = self.buffer[closest_indices]
                        X = np.random.randn(self.num_perturbations*3,len(param)) + param
                        cond_indices = cond_kdpp(self.buffer[closest_indices],X,k=(self.num_perturbations-num_closest))

                        self.buffer = np.vstack((self.buffer[closest_indices],X[cond_indices]))

                        all_indices = []





                queue = Queue()
                num_workers = []
                num_perts = self.num_perturbations
                shared_amt = self.num_perturbations//self.num_cpu
                if self.noise_type == 'CDPP' and iteration!=0:
                    num_perts = self.num_perturbations - int(self.perc_reuse*self.num_perturbations)
                    shared_amt = (self.num_perturbations - int(self.perc_reuse*self.num_perturbations))//self.num_cpu
                while num_perts > shared_amt:
                        num_perts -= shared_amt
                        num_workers.append(shared_amt)
                num_workers.append(num_perts)
                start_indices = [0]
                # print('num_workers=',num_workers)
                for i in range(1,len(num_workers)):
                   start_indices.append(start_indices[i-1]+num_workers[i-1])

                cma_param_slicer = [0]
                cma_param_slicer.extend(num_workers)
                cma_param_slicer = np.cumsum(cma_param_slicer)
                
                seed_id =  np.random.randint(np.iinfo(np.int32(10)).max, size=len(num_workers))


                workers = [Process(target = self._do_work,args = (queue,bbnp,param,action_space,state_space,self.max_length,seed_id[i],num_workers[i],env,self.continuous_action,cma_param[cma_param_slicer[i]:cma_param_slicer[i+1],:],start_indices[i])) for i in range(len(seed_id))]


                for worker in workers:
                    worker.start()

                results = [queue.get() for p in workers]
                
                # Swapping this with the above line so deadlock is avoided
                for worker in workers:
                    worker.join()
                if self.noise_type == 'CDPP':
                    if iteration != 0:
                        old_pert_fitness = pert_fitness[closest_indices]
                        old_anti_pert_fitness = anti_pert_fitness[closest_indices]
                
                pert_fitness,anti_pert_fitness,seed_id = get_info_summary(results)
                pert_fitness = np.array(pert_fitness)[...,None]
                anti_pert_fitness = np.array(anti_pert_fitness)[...,None]

                if self.noise_type == 'CDPP':
                    if iteration != 0:
                        
                        pert_fitness = np.vstack((old_pert_fitness,pert_fitness))
                        anti_pert_fitness = np.vstack((old_anti_pert_fitness,anti_pert_fitness))


                if self.noise_type in ['Gaussian','Hadamard']:
                    noises = get_noise_matrices(seed_id,num_workers,len(param),self.sigma,self.noise_type)

                #record average_fit
                average_fit = np.sum(pert_fitness)/self.num_perturbations

                fit_list.append(average_fit)
                iteration_list.append(iteration)

                #dynamic plot the graph
                plt.plot(iteration_list,fit_list,'r')
                plt.draw()
                plt.pause(0.3)

                #Ranking best
                if self.method_type == 'Rank':

                    top_ind = np.sort(np.argsort(pert_fitness,axis =0)[-self.best:][::-1],axis = 0).flatten()
                    pert_fitness = pert_fitness[top_ind]
                    gradient = (1 / len(top_ind) / self.sigma * (noises[top_ind,:].T@pert_fitness)).flatten()
                    SGD_gradient = SGD_.get_gradients(gradient)
                    param = param + SGD_gradient
                    print('param',param)


                #Vanilla
                elif self.method_type == 'Vanilla':
                    gradient = (1 / self.num_perturbations / self.sigma * (noises.T@pert_fitness)).flatten()
                    SGD_gradient = SGD_.get_gradients(gradient)
                    param = param + SGD_gradient
                    
                #CMA
                elif self.method_type == 'CMA':
                    cma_es.tell(cma_param,-pert_fitness[:,0] - compute_weight_decay(0.01,cma_param))
                    param = cma_es.result[5] # mean of all perturbations - for render and save - not used to update new space

                #ARS
                elif self.method_type == 'ARS':
                    fb_fitness = np.hstack((pert_fitness,anti_pert_fitness))
                    top_ind = (np.argsort(np.max(fb_fitness,axis = 1,keepdims = True),axis = 0)[-self.best:][::-1]).flatten()
                    fit_diff = pert_fitness - anti_pert_fitness
                    reward_noise = np.std(np.vstack((pert_fitness,anti_pert_fitness)))
                    fit_diff = fit_diff[top_ind]
                    gradient = (1 / len(top_ind) / self.sigma/reward_noise * (noises[top_ind,:].T@fit_diff)).flatten()
                    SGD_gradient = SGD_.get_gradients(gradient)
                    param = param + SGD_gradient



                #CDPP
                elif self.method_type == 'CDPP':
                    cond_noise = self.buffer - param
                    if iteration != 0:
                        noises = np.vstack((reused_samples,cond_noise))
                    else:
                        noises = cond_noise


                    top_ind = np.sort(np.argsort(pert_fitness,axis =0)[-self.best:][::-1],axis = 0).flatten()
                    best_pert_fitness = pert_fitness[top_ind]
                    
                    gradient = (1 / self.num_perturbations / self.sigma * ((self.buffer - param)[top_ind,:].T@best_pert_fitness)).flatten()
                   
                    param = param + self.lr * gradient

                if iteration % self.video_save_interval == 0 and iteration !=0:
                    normal = Normalizer(state_space)
                    video_env = gym.wrappers.Monitor(env, './videos/' + str(self.env) + '/'+ str(self.method_type) + '_perturbations_' + str(self.num_perturbations) + '_' +'state_renormalize_' + str(self.state_renormalize)+ '_Simga_' + str(self.sigma) + "_best_" + str(self.best) + "_iter_" +str(iteration) + "_test_" + str(self.test))
                    bbnp.roll_out(param,video_env,self.env,normal,render = True)
                print("-" * 100)

                te = time.time()

                #print the results
                print('iteration: {} | average_fit: {} | # params: {} | time: {:2.2f}s'.format(iteration,average_fit,len(param),te-ts))


                self.save_param(path,name,param,average_fit,iteration)
Example #34
class MLP:
    """
    This is the class used for processing the data and creating a
    multi-layered perceptron.

    ...

    Attributes
    ----------
    layers : list of the class Layer
        a list of the layers included in the MLP
    activations_functions : list of the class Activation
        a list of the activations included in the MLP
    loss_function : Loss
        the loss function defined for the MLP
    normalizer : Normalizer
        a normalizer class to normalize the data used in the MLP 
    errors : float []
        list for the average error for each training epoch in the MLP
    predict_errors : float 
        a float for the  average error of all data rows in predict

    Methods
    -------
    add_layer(self, size, activation_function)
        adds a layer to the MLP
    _backprop(self, d_loss, learning_rate)
        backpropagates through the MLP
    train(self, x, y, learning_rate=0.01, n_epochs=10)
        method for training the MLP
    predict(self, x, y)
        method for predicting given data
    plot(self, dataset_name, nr_epochs, y_test, y_pred, train_time, pred_time)
        method for plotting result from MLP
    
    """
    def __init__(self, loss_function):
        """
        Parameters          
        ----------
        loss_function : SquaredErrorLoss
            loss function for the MLP
        """

        self.layers = []
        self.activations_functions = []
        self.loss_function = loss_function
        self.normalizer = Normalizer()
        # All errors, returned in end for plotting
        self.errors = []
        self.predict_errors = 0

    def add_layer(self, size, activation_function):
        """Adds a layer to the MLP.

        Parameters
        ----------
        size : int
            The size of the layer to be added
        activation_function : Activation
            The activation function for the layer to be added
        """

        n_inputs = size
        if self.layers:
            n_inputs = self.layers[-1].n_nodes

        self.layers.append(Layer(size, n_inputs))
        self.activations_functions.append(activation_function)

    def _backprop(self, d_loss, learning_rate):
        """Backpropagates through the MLP.

        Parameters
        ----------
        d_loss : numpy array
            The derivated loss
        learning_rate : float
            The rate at which the MLP learns
        """

        for i in range(len(self.layers) - 1, 1, -1):
            input_data = self.layers[i].get_output()
            loss = self.activations_functions[i].backward(input_data).reshape(
                len(input_data), -1) * d_loss
            d_loss = self.layers[i].backprop(loss, learning_rate)

    def train(self, x, y, learning_rate=0.01, n_epochs=10):
        """Backpropagates through the MLP.

        Parameters
        ----------
        n_epochs : int
            The number of epochs the MLP will run
        x : floats [][]
            The input data for training the MLP
        y : floats [][]
            The target data for each training data
        learning_rate : float
            The rate at which the MLP learns
        """

        # Normalize data
        self.normalizer.fit(x, y)
        x, y = self.normalizer.normalize(x, y)

        # Main loop, handles forward and backprop
        for _ in range(n_epochs):
            output = x
            for j, layer in enumerate(self.layers):
                # Activation forward, sets the input and output for each layers
                output = self.activations_functions[j].forward(
                    layer.forward(output))
                layer.set_output(output)

            # Current error for the epoch is saved
            error = (np.average(self.loss_function.forward(output, y)))
            self.errors.append(error)

            # Get the derivative loss from the output node
            d_loss = self.loss_function.backward(output, y)
            # Backprop
            self._backprop(d_loss, learning_rate)

    def predict(self, x, y):
        """Predicts a given input with the MLP.

        Parameters
        ----------
        x : floats [][]
            The input data for training the MLP
        y : floats [][]
            The target data for each training data
        """

        # Normalize data
        x, y = self.normalizer.normalize(x, y)

        output = x
        for j, layer in enumerate(self.layers):
            output = self.activations_functions[j].forward(
                layer.forward(output))

        self.predict_errors = (np.average(self.loss_function.forward(
            output, y)))

        return self.normalizer.renormalize(output)

    def plot(self, dataset_name, nr_epochs, y_test, y_pred, train_time,
             pred_time):
        """Plots the results from a dataset trained and predicted with the MLP.

        Parameters
        ----------
        dataset_name : str
            The name of the dataset being plot
        nr_epochs : int
            The number of epochs the MLP has been trained
        y_test : floats [][]
            The target data for each training data
        y_pred : floats [][]
            The predicted result from the MLP
        train_time : float
            The elapsed time of the MLP training on the dataset
        pred_time : float
            The elapsed time of the MLP prediction on the dataset
        """

        xy_max = max(max(y_pred), max(y_test))
        xy_min = min(min(y_pred), min(y_test))

        plt.figure(figsize=(10, 6))
        plt.suptitle("Data: {}".format(dataset_name))
        plt.subplot(121)
        plt.scatter(np.arange(1, nr_epochs + 1), self.errors, label='loss')
        plt.title("Average Loss by epoch")
        plt.xlabel('Epochs')
        plt.ylabel('Loss')

        pred_info = "Total pred time: {:.6f}\n".format(pred_time)
        train_info = "Total train time: {:.2f}\n".format(train_time)
        average_loss = "Pred MSE: {:.2f}\n".format(
            np.average(self.predict_errors))
        epochs = "Number of epochs: {}".format(nr_epochs)

        text = pred_info + train_info + average_loss + epochs

        plt.annotate(text,
                     xy=(1, 1),
                     xytext=(-15, -15),
                     fontsize=10,
                     xycoords='axes fraction',
                     textcoords='offset points',
                     bbox=dict(facecolor='white', alpha=0.8),
                     horizontalalignment='right',
                     verticalalignment='top')

        plt.subplot(122)
        plt.scatter(y_test, y_pred)
        plt.xlim(xy_min, xy_max)
        plt.ylim(xy_min, xy_max)
        plt.xlabel("Target")
        plt.ylabel("Predicted")
        plt.title("Actual Y vs Predicted ")

        plt.show()
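Putting the class together, a minimal training and prediction run could look like the sketch below. SquaredErrorLoss is the loss type named in the constructor docstring; a sigmoid-style Activation (called Sigmoid here) is assumed to exist with the forward/backward interface used above, and the data shapes follow the train/predict docstrings:

import numpy as np

x = np.random.rand(100, 3)                 # 100 rows, 3 input features
y = x.sum(axis=1, keepdims=True)           # toy regression target

mlp = MLP(SquaredErrorLoss())              # loss class assumed to exist
mlp.add_layer(3, Sigmoid())                # first layer sized to the inputs
mlp.add_layer(8, Sigmoid())                # hidden layer
mlp.add_layer(1, Sigmoid())                # output layer
mlp.train(x, y, learning_rate=0.05, n_epochs=200)
y_pred = mlp.predict(x, y)                 # returns renormalized predictions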
while line:
    strLine = line.split(delim)
    try:
        entryPointNumber = int(strLine[0])
        epName = genericAgentTypeName(strLine[2])
        activityEntryPointName = epName + "_" + strLine[0].replace(" ", "_") + "_" + strLine[1].replace(" ", "_")
        entryPointName = strLine[0].replace(" ", "_") + "_" + strLine[1].replace(" ", "_")
        totalEntries = int(strLine[16])
        if totalEntries > 0:
            ## Generating list of agenttypes over entry points
            if not entryPointName in agentTypesAtEntryPoints.keys():
                agentTypesAtEntryPoints[entryPointName] = []
            agentTypesAtEntryPoints[entryPointName].append(activityEntryPointName)
            ## Writing Time Tables
            
            n = Normalizer()
            
            timetableHeader = "\n<timetableData ID=\"tt_" + epName + "_" + n.normalize(entryPointName) + "\">"
            timetableNote = "<!-- Time Table for Entrypoint ID: " + str(entryPointNumber) + " -->"
            timetableFooter = "</timetableData>"
            outTTHdl.write(indent1 + timetableHeader + "\n")
            outTTHdl.write(indent2 + timetableNote + "\n")
            time1 = 0
            timeIncrement = 2
            for i in range(3, 15):
                try:
                    pctCount = int(strLine[i])
                except (ValueError, IndexError):
                    pctCount = 0
                time2 = time1 + timeIncrement
                count = int(round(totalEntries * (pctCount / 100.0)))
Example #36
class LDA(object):
    def __init__(self, vectorizer="tfidf", stopwords_path=None):
        self.vectorizer = vectorizer
        self.stopwords_path = stopwords_path
        self.normalizer = Normalizer(self.stopwords_path)
        self.normalizer.load_stopwords()
        
    @print_run_time
    def tranform_corpora(self, corpora_dir, corpora_path, id2word_path):
        """转化语料

        1. 从{corpora_dir}文件夹下提取所有.txt文件作为语料
        2. 文件总每一行经过预处理后作为一行,存入{corpora_path}文件
        3. 保存id2word到{id2word_path}文件
        """
        self.corpora_dir = corpora_dir
        self.corpora_path = corpora_path
        self.id2word_path = id2word_path
        self._transform_corpora(self.normalizer, self.corpora_dir, self.corpora_path, self.id2word_path)
        
    @print_run_time
    def train_lda(self, model_dir, model_fname, num_topics):
        self.model_dir = model_dir
        self.model_fname = model_fname
        self.num_topics = num_topics
        self.model = self._train_lda(self.vectorizer, self.corpora_path, self.id2word_path, 
                                     self.model_dir, self.model_fname, self.num_topics)
        self.model_path = os.path.join(self.model_dir, self.vectorizer, self.model_fname)

    @staticmethod
    def _transform_corpora(normalizer, corpora_dir, corpora_path, id2word_path):
        """转化语料

        1. 从{corpora_dir}文件夹下提取所有.txt文件作为语料
        2. 文件总每一行经过预处理后作为一行,存入{corpora_path}文件
        3. 保存id2word到{id2word_path}文件

        Args:
            corpora_dir(path) :- 语料文件所在的文件夹
            corpora_path(path) :- 汇总所有语料的.txt文件
            id2word_path(path) :- gensim的字典文件
        """
        corpora = []
        if not os.path.isdir(corpora_dir):
            raise OSError(corpora_dir, "doesn't exist")

        if not os.path.isdir(os.path.dirname(corpora_path)):
            raise OSError(os.path.dirname(corpora_path), " doesn't exist")

        if not os.path.isdir(os.path.dirname(os.path.dirname(id2word_path))):
            raise OSError("the grandparent directory of ", id2word_path, " doesnt't exist")

        output_tfidf = open(corpora_path, 'a', encoding="utf8")
        for file in os.listdir(corpora_dir):
            if file.endswith('txt'):
                file = os.path.join(corpora_dir, file)
                print(file+' read')
                with open(file, encoding="utf8") as f:
                    lines = f.readlines()
                    for line in lines:
                        words = normalizer.tokenize(line)
                        if len(words) > 0:
                            corpora.append(words) 
                            output_tfidf.write('{}\n'.format(" ".join(words)))
                f.close()

        output_tfidf.close()    
        id2word = gensim.corpora.Dictionary(corpora)

        parent_dir = os.path.dirname(id2word_path)
        make_dir(parent_dir)
        if not os.path.isfile(id2word_path):
            id2word.save(id2word_path) 
            print('id2word saved') 
        else:
            print(id2word_path, ' already exists')

    @staticmethod
    def _train_lda(vectorizer, corpora_path, id2word_path, model_dir, model_fname=model_fname, num_topics=10):
        """训练和保存基于tfidf的lda模型

        基于{corpora_path}文件保存的语料和{id2word_path}保存的gensim字典来训练lda_tfidf模型,

        保存该模型到{model_dir}文件夹下

        Args:
            vectorizer(str) :- 向量化方法, choices=["bow", "tfidf"]
            corpora_path(path) :- 保存语料的.txt文件
            id2word_path(path) :- 保存gensim字典的文件
            model_dir(path) :- 保存gensim LDA模型的文件夹
            model_fname(path) :- 模型文件名
            num_topics(int) :- lda的超参,主题数
        """
        try:
            assert vectorizer in ["bow", "tfidf"]
        except AssertionError:
            raise AssertionError("vectorizer must be bow or tfidf")
        
        if not os.path.isdir(model_dir):
            raise OSError(model_dir, "doesn't exist")

        corpora = []
        with open(corpora_path, 'r', encoding="utf8") as fp:
            lines = fp.readlines()
            for line in lines:
                corpora.append(line.strip())
        id2word = gensim.corpora.Dictionary.load(id2word_path)
        corpus = [id2word.doc2bow(corpus.split(" ")) for corpus in corpora]
        
        # for tfidf we also need to compute the idf weights
        if vectorizer == "tfidf":
            MmCorpus.serialize(corpus_tfidf_mm, corpus)
            corpus = MmCorpus(corpus_tfidf_mm)
            
        model = gensim.models.LdaModel(corpus=corpus, id2word=id2word, num_topics=num_topics)

        model_path = os.path.join(model_dir, vectorizer)
        make_dir(model_path)
        model_path = os.path.join(model_path, model_fname)
        if not os.path.isfile(model_path):
            model.save(model_path)
            print('model saved')
        else:
            print(f"{model_path} already exists")
        return model

    @staticmethod
    def analysis_topics(fname):
        """将各个主题的关键字打印出来

        把self.model.print_topics(10)保存到fname后打印出来
        """
        f = open(fname, 'r')
        lines = f.readlines()
        for line in lines:
            print(re.findall(r'\"([^\"]*)\"', line))

    @staticmethod
    def _short_long_similarity(model_path, normalizer, id2word_path, short, long):
        """计算长短文本的相似度

        Args:
            model_path(path) :- gensim.models.ldamodel的保存路径
            id2word_path(path) :- gensim.corpora.Dictionary的保存路径
            short(str) :- 短文本
            long(str) :- 长文本
        Returns:
            prob(float) :- 长短文本的匹配度
            theta(iterables) :- 长文本在lda模型下的主题分布概率,
                                每个元素为(主题的序号, 对应主题的概率)
        """
        lda = gensim.models.LdaModel.load(model_path)
        id2word = gensim.corpora.Dictionary.load(id2word_path)
        theta = lda[id2word.doc2bow(normalizer.tokenize(long))]
        short = normalizer.tokenize(short)
        short = set(short)
        short = id2word.doc2idx(short)
        prob = 0
        for word in short:
            prob_w = sum([lda.expElogbeta[k][word]*1000 * p_zk for (k, p_zk) in theta])
            prob += math.log(prob_w)
        prob = prob/len(short)
        prob -= math.log(1000)
        return prob, theta

    def short_long_sim(self, short, long):
        """用self.model计算长短文本相似度
        """
        return self._short_long_similarity(self.model_path, self.normalizer, self.id2word_path, short, long)
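End to end, the class above is meant to be driven in three steps: transform the corpora, train the model, then score short/long text pairs. A hedged usage sketch (all paths are placeholders and must already exist, as the directory checks above require):

lda = LDA(vectorizer="tfidf", stopwords_path="stopwords.txt")
lda.tranform_corpora(corpora_dir="corpora/",
                     corpora_path="out/corpora.txt",
                     id2word_path="out/dict/id2word.dict")
lda.train_lda(model_dir="models/", model_fname="news.model", num_topics=10)
prob, theta = lda.short_long_sim("short query text", "a much longer document ...")
print(prob, theta)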
Example #37
	def GetJointLogProbability(logProbabilities, labels, ids):
		modelAggregator = ModelAggregator()
		normalizer = Normalizer()
		unormalizedLogProbs, labels, ids = modelAggregator.Aggregate(logProbabilities, labels, ids)
		
		return (normalizer.normalizeLogProbabilities(unormalizedLogProbs), labels,ids)
Example #38
 def __init__(self, vectorizer="tfidf", stopWordsFile=None):
     self.vectorizer = vectorizer
     self.stopWordsFile = stopWordsFile
     self.normalizer = Normalizer(self.stopWordsFile)
     self.normalizer.load_stopWords()
Example #39
class LDA():
    def __init__(self, vectorizer="tfidf", stopWordsFile=None):
        self.vectorizer = vectorizer
        self.stopWordsFile = stopWordsFile
        self.normalizer = Normalizer(self.stopWordsFile)
        self.normalizer.load_stopWords()

    @print_run_time
    def create_corporaListAndCorporaText(self, corpora_source, corpora_txt,
                                         id2word_fname):
        self.corpora_source = corpora_source
        self.corpora_txt = corpora_txt
        self.id2word_fname = id2word_fname
        self._create_corporaListAndCorporaText(self.normalizer,
                                               self.corpora_source,
                                               self.corpora_txt,
                                               self.id2word_fname)

    @print_run_time
    def createAndSave_lda(self, ldaModel_save_repo, num_topics):
        self.ldaModel_save_repo = ldaModel_save_repo
        self.num_topics = num_topics
        if self.vectorizer == "tfidf":
            self.model = self._createAndSave_lda_tfidf(self.corpora_txt,
                                                       self.id2word_fname,
                                                       self.ldaModel_save_repo,
                                                       self.num_topics)
            self.model_fname = self.ldaModel_save_repo + '/gensim_tfidf/crawl_news.model'
        else:
            self.model = self._createAndSave_lda_bow(self.corpora_txt,
                                                     self.id2word_fname,
                                                     self.ldaModel_save_repo,
                                                     self.num_topics)
            self.model_fname = self.ldaModel_save_repo + '/gensim_bow/crawl_news.model'

    @staticmethod
    def _create_corporaListAndCorporaText(normalizer, corpora_source,
                                          corpora_txt, id2word_fname):
        ''' Collect every .txt file under {corpora_source} as corpus material.

        Each line of every file is preprocessed and stored as one line in {corpora_txt},

        and id2word is saved to {id2word_fname}.

        Args:
            corpora_source(path) :- directory that holds the corpus files
            corpora_txt(path) :- .txt file that aggregates all corpora
            id2word_fname(path) :- gensim dictionary file
        '''
        corpora = []
        if not os.path.isdir(corpora_source):
            raise OSError(corpora_source, "doesn't exist")

        if not os.path.isdir(os.path.dirname(corpora_txt)):
            raise OSError(os.path.dirname(corpora_txt), " doesn't exist")

        if not os.path.isdir(os.path.dirname(os.path.dirname(id2word_fname))):
            raise OSError("the grandparent directory of ", id2word_fname,
                          " doesnt't exist")

        output_tfidf = open(corpora_txt, 'a', encoding="utf8")
        for file in os.listdir(corpora_source):
            if file.endswith('txt'):
                file = os.path.join(corpora_source, file)
                print(file + ' read')
                with open(file, encoding="utf8") as f:
                    lines = f.readlines()
                    for line in lines:
                        words = normalizer.tokenize(line)
                        if len(words) > 0:
                            corpora.append(words)
                            output_tfidf.write('{}\n'.format(" ".join(words)))

        output_tfidf.close()
        id2word = gensim.corpora.Dictionary(corpora)

        parent_dir = os.path.dirname(id2word_fname)
        make_dir(parent_dir)
        if not os.path.isfile(id2word_fname):
            id2word.save(id2word_fname)
            print('id2word saved')
        else:
            print(id2word_fname, ' already exists')

    @staticmethod
    def _createAndSave_lda_bow(corpora_txt,
                               id2word_fname,
                               ldaModel_save_repo,
                               num_topics=10):
        '''  Train and save a bag-of-words based LDA model

        Trains an lda_bow model from the corpus stored in the {corpora_txt} file and the gensim dictionary stored in {id2word_fname},

        with {num_topics} topics,

        and saves the model under the {ldaModel_save_repo} directory

        Args:
            corpora_txt(path) :- .txt file holding the corpus
            id2word_fname(path) :- file holding the gensim dictionary
            ldaModel_save_repo(path) :- directory where the gensim LDA model is saved
            num_topics(int) :- LDA hyperparameter, the number of topics
        '''
        if not os.path.isdir(ldaModel_save_repo):
            raise OSError(ldaModel_save_repo, "doesn't exist")

        corpora = []
        with open(corpora_txt, 'r', encoding="utf8") as fp:
            lines = fp.readlines()
            for line in lines:
                corpora.append(line.strip())
        id2word = gensim.corpora.Dictionary.load(id2word_fname)
        corpus = [id2word.doc2bow(corpus.split(" ")) for corpus in corpora]
        lda_bow = gensim.models.LdaModel(corpus=corpus,
                                         id2word=id2word,
                                         num_topics=num_topics)

        make_dir(ldaModel_save_repo + '/gensim_bow')
        if not os.path.isfile(ldaModel_save_repo +
                              '/gensim_bow/crawl_news.model'):
            lda_bow.save(ldaModel_save_repo + '/gensim_bow/crawl_news.model')
            print('lda_bow saved')
        else:
            print(ldaModel_save_repo,
                  '/gensim_bow/crawl_news.model already exists')

        return lda_bow

    @staticmethod
    def _createAndSave_lda_tfidf(corpora_txt,
                                 id2word_fname,
                                 ldaModel_save_repo,
                                 num_topics=10):
        '''  Train and save a TF-IDF based LDA model

        Trains an lda_tfidf model from the corpus stored in the {corpora_txt} file and the gensim dictionary stored in {id2word_fname},

        with {num_topics} topics,

        and saves the model under the {ldaModel_save_repo} directory

        Args:
            corpora_txt(path) :- .txt file holding the corpus
            id2word_fname(path) :- file holding the gensim dictionary
            ldaModel_save_repo(path) :- directory where the gensim LDA model is saved
            num_topics(int) :- LDA hyperparameter, the number of topics
        '''
        if not os.path.isdir(ldaModel_save_repo):
            raise OSError(ldaModel_save_repo, "doesn't exist")

        corpora = []
        with open(corpora_txt, 'r', encoding="utf8") as fp:
            lines = fp.readlines()
            for line in lines:
                corpora.append(line.strip())
        id2word = gensim.corpora.Dictionary.load(id2word_fname)

        MmCorpus.serialize(
            'corpus_tfidf.mm',
            [id2word.doc2bow(corpus.split(" ")) for corpus in corpora])
        mm = MmCorpus('corpus_tfidf.mm')
        lda_tfidf = gensim.models.LdaModel(corpus=mm,
                                           id2word=id2word,
                                           num_topics=num_topics)
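        # Note: despite the method name, the MmCorpus serialized above still holds
        # plain bag-of-words vectors; no TF-IDF weighting is applied before training.
        # A hedged sketch of adding that step with gensim (not part of the original
        # code) would be:
        #
        #     bow = [id2word.doc2bow(doc.split(" ")) for doc in corpora]
        #     tfidf = gensim.models.TfidfModel(bow)
        #     lda_tfidf = gensim.models.LdaModel(corpus=tfidf[bow],
        #                                        id2word=id2word,
        #                                        num_topics=num_topics)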

        make_dir(ldaModel_save_repo + '/gensim_tfidf')
        if not os.path.isfile(ldaModel_save_repo +
                              '/gensim_tfidf/crawl_news.model'):
            lda_tfidf.save(ldaModel_save_repo +
                           '/gensim_tfidf/crawl_news.model')
            print('lda_tfidf saved')
        else:
            print(ldaModel_save_repo,
                  '/gensim_tfidf/crawl_news.model already exists')
        return lda_tfidf

    @staticmethod
    def analysis_topics(fname):
        '''Print the keywords of each topic

        Reads fname, to which self.model.print_topics(10) has been saved, and prints the keywords
        '''
        with open(fname, 'r') as f:
            for line in f:
                print(re.findall(r'\"([^\"]*)\"', line))

    @staticmethod
    def _short_long_similarity(lda_fname, normalizer, id2word_fname, short,
                               long):
        '''Compute the similarity between a short and a long text
        Args:
            lda_fname(path) :- path where the gensim.models.ldamodel is saved
            id2word_fname(path) :- path where the gensim.corpora.Dictionary is saved
            short(str) :- short text
            long(str) :- long text
        Returns:
            prob(float) :- matching score between the short and long texts
            Theta(iterable) :- topic distribution of the long text under the LDA model,
                               each element being (topic index, probability of that topic)
        '''
        lda = gensim.models.LdaModel.load(lda_fname)
        id2word = gensim.corpora.Dictionary.load(id2word_fname)
        Theta = lda[id2word.doc2bow(normalizer.tokenize(long))]
        short = normalizer.tokenize(short)
        short = set(short)
        # doc2idx maps out-of-vocabulary words to -1; drop them so they do not
        # silently index the last column of expElogbeta
        short = [idx for idx in id2word.doc2idx(short) if idx != -1]
        prob = 0
        for word in short:
            prob_w = sum([
                lda.expElogbeta[k][word] * 1000 * p_zk for (k, p_zk) in Theta
            ])
            prob += math.log(prob_w)
        prob = prob / len(short)
        prob -= math.log(1000)
        return prob, Theta

    def short_long_sim(self, short, long):
        """Compute the short/long text similarity with self.model"""
        return self._short_long_similarity(self.model_fname, self.normalizer,
                                           self.id2word_fname, short, long)
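
A minimal end-to-end usage sketch for this LDA class (file paths and the topic count are illustrative assumptions, not taken from the example):

lda = LDA(vectorizer="tfidf", stopWordsFile="stopwords.txt")
lda.create_corporaListAndCorporaText("news_corpora/", "corpora/all_news.txt",
                                     "corpora/dicts/id2word.dict")
lda.createAndSave_lda("models", num_topics=10)
# Score how well a short headline matches a longer article under the trained topics
score, theta = lda.short_long_sim("stock market rally", "Full text of a longer news article ...")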
Example #40
0
    def writeStates(self, outStateHdl, entryPointID, entryPointName, wayPointProbabilities, countFunctions):
        ## Entry-/waypoint probability states
        
        i = 2
        # propSum counts how many waypoints have a positive probability in column i
        propSum = 0
        for wayPoint in wayPointProbabilities:
            if wayPoint[i] > 0:
                propSum += 1

        # The propSum guard is currently disabled, so this block always runs
        if True:  # originally: if propSum > 0:
            entryPointName = entryPointName.replace(" ", "_")
            outStateHdl.write("<stateType ID=\"" + "entryState_" + self.genericAgentId + "_" + str(entryPointID) + "_" + entryPointName + "\">\n")
            outStateHdl.write(indent2 + "<iconcolour>" + self.iconColor + "</iconcolour>\n")
            outStateHdl.write(indent2 + "<speed>\n")
            outStateHdl.write(indent2 + indent2 + "<alpha>" + str(self.speedAlpha) + "</alpha>\n")                  
            outStateHdl.write(indent2 + indent2 + "<beta>" + str(self.speedBeta) + "</beta>\n")                  
            outStateHdl.write(indent2 + "</speed>\n")
            outStateHdl.write(indent2 + "<headOnFactor>1.0</headOnFactor>\n")
            outStateHdl.write(indent2 + "<agentMode>network</agentMode>\n")
            outStateHdl.write(indent2 + "<type>normal</type>\n")
            outStateHdl.write(indent2 + "<instantiationFunctions>\n" + indent2 + indent2 + "<function name=\"selectWaypoint\">\n")
            n = Normalizer()
            for wayPoint in wayPointProbabilities:
                if wayPoint[i] > 0:
                    # Pad short probability values so the trailing comments line up
                    blanks = " " * max(1, 4 - len(str(wayPoint[i])))
                    outStateHdl.write(indent2 + indent2 + "<waypoint id=\"" + str(wayPoint[0]) + "\" probability=\"" + str(wayPoint[i]) + "\" />" + blanks + "<!-- " + n.normalize(wayPoint[1]) + "-->\n")
            outStateHdl.write(indent2 + indent2 + "</function>\n" + indent2 + "</instantiationFunctions>\n")
            countFunctions += 1
            outStateHdl.write("</stateType>\n\n") 
        i += 1
        ## Pause at waypoint behaviour
        entryPointName = entryPointName.replace(" ", "_")
        outStateHdl.write("<stateType ID=\"" + "atWayPointState_" + self.genericAgentId + "_" + str(entryPointID) + "_" + entryPointName + "\">\n")
        outStateHdl.write(indent2 + "<iconcolour>" + self.iconColorPause + "</iconcolour>\n")
        outStateHdl.write(indent2 + "<speed>\n")
        outStateHdl.write(indent2 + indent2 + "<alpha>" + str(self.speedAlpha) + "</alpha>\n")                  
        outStateHdl.write(indent2 + indent2 + "<beta>" + str(self.speedBeta) + "</beta>\n")                  
        outStateHdl.write(indent2 + "</speed>\n")
        outStateHdl.write(indent2 + "<headOnFactor>1.0</headOnFactor>\n")
        outStateHdl.write(indent2 + "<agentMode>network</agentMode>\n")
        outStateHdl.write(indent2 + "<type>normal</type>\n")
        outStateHdl.write(indent2 + "<categoricTransitionFunctions>\n")
        outStateHdl.write(indent2 + indent2 + "<function name=\"waitTime\">\n")
        outStateHdl.write(indent2 + indent2 + indent2 +  "<parameter>" + str(self.wayPointWait) + "</parameter>\n")
        outStateHdl.write(indent2 + indent2 + indent2 + "<toState>" + "Exit_" + self.genericAgentId + "_" + str(entryPointID) + "_" + entryPointName + "</toState>\n")                  
        outStateHdl.write(indent2 + indent2 + "</function>\n")
        outStateHdl.write(indent2 + "</categoricTransitionFunctions>\n")
        outStateHdl.write("</stateType>\n\n") 

        ## Exit behaviour 
        outStateHdl.write("<stateType ID=\"" + "exitState_" + self.genericAgentId + "_" + str(entryPointID) + "_" + entryPointName + "\">\n")
        outStateHdl.write(indent2 + "<iconcolour>" + self.iconColorExit + "</iconcolour>\n")
        outStateHdl.write(indent2 + "<speed>\n")
        outStateHdl.write(indent2 + indent2 + "<alpha>" + str(self.speedAlpha) + "</alpha>\n")                  
        outStateHdl.write(indent2 + indent2 + "<beta>" + str(self.speedBeta) + "</beta>\n")                  
        outStateHdl.write(indent2 + "</speed>\n")
        outStateHdl.write(indent2 + "<headOnFactor>1.0</headOnFactor>\n")
        outStateHdl.write(indent2 + "<agentMode>network</agentMode>\n")
        outStateHdl.write(indent2 + "<type>normal</type>\n")
        outStateHdl.write(indent2 + "<instantiationFunctions>\n")
        outStateHdl.write(indent2 + indent2 + "<function name=\"selectWaypoint\">\n")
        outStateHdl.write(indent2 + indent2 + indent2 + "<waypoint id=\"" + str(entryPointID) + "\" probability=\"100\"/>\n")                  
        outStateHdl.write(indent2 + indent2 + "</function>\n")
        outStateHdl.write(indent2 + "</instantiationFunctions>\n")
        outStateHdl.write("</stateType>\n\n") 
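
The three stateType blocks above are built from the same write(indent2 + ...) boilerplate. A small helper (a sketch only; the element names mirror the strings used above, everything else, including the two-space indent unit, is a hypothetical assumption) keeps each block to a handful of calls:

def write_tag(out, name, value, depth=1):
    # Write one indented <name>value</name> line
    out.write("  " * depth + "<" + name + ">" + str(value) + "</" + name + ">\n")

# e.g. the speed block could become:
# out.write("  <speed>\n")
# write_tag(out, "alpha", self.speedAlpha, depth=2)
# write_tag(out, "beta", self.speedBeta, depth=2)
# out.write("  </speed>\n")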
Example #41
0
import os

import gym
import numpy as np
from gym import wrappers

from HyperParameter import HyperParameter
from Normalizer import Normalizer
from AiPolicy import AIPolicy
from Explorator import Explorator
from Trainer import Trainer

# Running the main code


def mkdir(base, name):
    path = os.path.join(base, name)
    if not os.path.exists(path):
        os.makedirs(path)
    return path


workDir = mkdir('exp', 'brs')
monitorDir = mkdir(workDir, 'monitor')

hyperParameter = HyperParameter()
np.random.seed(hyperParameter.seed)
environment = gym.make(hyperParameter.environmentName)
environment = wrappers.Monitor(environment, monitorDir, force=True)
inputsNumber = environment.observation_space.shape[0]
outputsNumber = environment.action_space.shape[0]
policy = AIPolicy(inputsNumber, outputsNumber, hyperParameter)
normalizer = Normalizer(inputsNumber)
explorator = Explorator(hyperParameter, normalizer, policy, environment)
trainer = Trainer(policy, normalizer, hyperParameter, explorator)
trainer.train()
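
The Normalizer imported above is not shown here; in augmented-random-search style training loops it is typically an online state normalizer that tracks a running mean and variance per observation dimension. A minimal sketch of such a class (an assumption about its interface, not the actual Normalizer module) using Welford's update:

import numpy as np

class RunningNormalizer:
    def __init__(self, nb_inputs):
        self.n = 0
        self.mean = np.zeros(nb_inputs)
        self.m2 = np.zeros(nb_inputs)          # sum of squared deviations (Welford)

    def observe(self, x):
        # Update running mean/variance with one observation
        self.n += 1
        delta = x - self.mean
        self.mean += delta / self.n
        self.m2 += delta * (x - self.mean)

    def normalize(self, x):
        var = self.m2 / self.n if self.n > 1 else np.ones_like(self.mean)
        return (x - self.mean) / np.sqrt(np.clip(var, 1e-8, None))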
Example #42
0
from NN import NeuralNetwork
from Normalizer import Normalizer
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

cancer = load_breast_cancer()
X = cancer['data']
y = cancer['target']
length = len(cancer['feature_names'])

nn = NeuralNetwork([length, 30, 20, 10, 5, 1])

X_train, X_test, y_train, y_test = train_test_split(X, y)

normalize = Normalizer()
normalize.fit(X_train)
X_train = normalize.transform(X_train)
X_test = normalize.transform(X_test)

nn.fit(X_train, y_train, epochs=1000, verbose=False)
predictions = nn.predict(X_test)
print(nn.cost(predictions, y_test))
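
The custom Normalizer above follows the familiar fit/transform scaler pattern. For comparison (a sketch only; whether the custom class performs the same feature-wise standardization is an assumption), scikit-learn's StandardScaler slots into the identical spot:

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)   # fit on training data only
X_test = scaler.transform(X_test)         # reuse the training statistics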
Example #43
0
 def __init__(self, vectorizer="tfidf", stopwords_path=None):
     self.vectorizer = vectorizer
     self.stopwords_path = stopwords_path
     self.normalizer = Normalizer(self.stopwords_path)
     self.normalizer.load_stopwords()
Example #44
0
    def collect_news(self):
        self.output = self.config["DEFAULT"]["output"]
        query_interval = int(self.config["DEFAULT"]["query_interval"])
        iterations = int(self.config["DEFAULT"]["iterations"])
        iterations_counter = 0

        while iterations_counter <= iterations:
            for section in self.config.sections():
                for option in self.config[section]:
                    url_base = self.config[section]['url_base']

                    if option not in [
                            'url_base', 'query_interval', 'tmp', 'output',
                            'iterations'
                    ]:
                        path = url_base + self.config[section][option]
                        try:
                            xml_data = requests.get(path)
                        except requests.exceptions.ChunkedEncodingError:
                            if self._callback:
                                self._callback("DLERR", path)
                            continue
                        try:
                            tree = ET.ElementTree(
                                ET.fromstring(xml_data.content))
                        except ET.ParseError:
                            if self._callback:
                                self._callback("PARSEERR", path)
                            continue

                        root = tree.getroot()
                        news_list = root.findall('./channel/item')
                        normalized_option = Normalizer.normalize_name(option)

                        self.create_dir_if_not_exists(section)

                        output_xml_file = self.create_output_file_if_not_exists(
                            normalized_option, section)

                        tree_output = ET.parse(output_xml_file)
                        root = tree_output.getroot()

                        for article in news_list:
                            article_title = article.find('title')
                            article_date = article.find('pubDate')

                            try:
                                article_title.text = self.normalize_value(
                                    article_title.text)
                                article_date.text = self.normalize_value(
                                    article_date.text)

                                search_filter = './item[title=' + '"' + article_title.text + '"' + "]" \
                                                + '[pubDate=' + '"' + article_date.text + '"' + ']'

                                all_items = root.findall(search_filter)

                                if len(all_items) == 0:
                                    if self._callback:
                                        self._callback("NEWARTICLE", section,
                                                       option,
                                                       article_title.text)
                                    root.append(article)
                            except AttributeError:
                                if self._callback:
                                    self._callback("BADFORMAT")

                        tree_output.write(output_xml_file)

            if self._callback:
                self._callback("WAITING", query_interval)
                self._callback("CANINTERR")
            time.sleep(query_interval)
            iterations_counter += 1
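
The XPath filter built above by string concatenation breaks as soon as an article title or pubDate contains a double quote. A more defensive duplicate check (a sketch reusing the same element names; the helper name is hypothetical) compares the parsed values directly:

def already_collected(root, title, pub_date):
    # Walk the stored <item> elements and compare title/pubDate as plain strings
    for item in root.findall('./item'):
        t = item.find('title')
        d = item.find('pubDate')
        if t is not None and d is not None and t.text == title and d.text == pub_date:
            return True
    return False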