Code Example #1
def main():
    df_train: pd.DataFrame = load_train_data()
    df_eval: pd.DataFrame = load_eval_data()

    df_train = preprocessing(df_train)
    df_eval = preprocessing(df_eval)

    X_train = df_train
    y_train = df_train["score"]

    X_eval = df_eval
    y_eval = df_eval["score"]

    model = Pipeline([("features",
                       ColumnTransformer([
                           ("target_enc",
                            ce.TargetEncoder(cols=["reviewerID", "asin"]),
                            ["reviewerID", "asin"]),
                       ])), ("model", SGDClassifier(loss='modified_huber'))])

    # model = ce.TargetEncoder(cols=["reviewerID"])

    model.fit(X_train, y_train)

    print("Target encoding eval acc:",
          np.mean(model.predict(X_eval) == y_eval))
    print("Target encoding acc:", np.mean(model.predict(X_train) == y_train))

    with open("models/target_encoding.pickle", "wb") as file:
        pickle.dump(model, file)
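
A minimal follow-up sketch (not part of the original script): how the pickled pipeline could be loaded back for scoring. The frame name df_new is a placeholder; it only needs the "reviewerID" and "asin" columns that the target encoder selects, and unpickling requires the same libraries to be importable.

import pickle

# Load the pipeline saved above and score new rows.
with open("models/target_encoding.pickle", "rb") as file:
    loaded_model = pickle.load(file)
# predictions = loaded_model.predict(df_new)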
Code Example #2
File: pstrip.py Project: uncbiag/PStrip
def brain_extraction(args):
    print 'Starting pre-processing'
    preprocessing(args)
    print 'Starting Decomposition/Registration'
    main(args)
    print 'Starting post-processing'
    postprocessing(args)

    return
Code Example #3
	def __getData(self):		
		for i in xrange(self.__classes):
			for j in xrange(self.__train_samples):				
				# Load file (TODO: rewrite this loading logic)
				if j < 10:
					_file = self.__file_path + str(i) + "/" + str(i) + "0" + str(j) + ".pbm"
				else:
					_file = self.__file_path + str(i) + "/" + str(i) + str(j) + ".pbm"
				src_image = imread(_file, 0)

				#process file
				prs_image = preprocessing(src_image, self.__size, self.__size)
				
				#Set class label
				row = GetRow(self.__trainClasses, i * self.__train_samples + j)				
				Set(row, i)				
				
				#Set data
				row = GetRow(self.__trainData, i * self.__train_samples + j)				

				img = CreateImage((self.__size, self.__size), IPL_DEPTH_32F, 1 )
				# convert the 8-bit image to a 32-bit float image
				ConvertScale(fromarray(prs_image), img, scale=(1.0/255))

				data = GetSubRect(img, (0, 0, self.__size, self.__size))
				
				# convert the size x size data matrix to a vector
				row1 = Reshape(data, 0, 1)
				Copy(row1, row)
Code Example #4
File: w2v_d2v_functions.py Project: zu-ann/IR
def get_paragraphs(files_list, mystem, del_stopwords=False):
    file_text = {}
    data = []

    for i, file in enumerate(files_list):
        if file.endswith('.txt'):
            with open(file, 'r', encoding='utf-8') as f:
                text = f.read()
                file_text[file] = text
        else:
            text = file
            file = i

        paragraphs = splitter(text, 1)

        for paragraph in paragraphs:
            paragraph_lemmatized = preprocessing(paragraph, del_stopwords)
            data.append({'file': file, 'paragraph': paragraph_lemmatized})

    if file_text:
        with open('file_text', 'w', encoding='utf-8') as fw:
            json.dump(file_text, fw)
        return data, file_text

    else:
        return data
Code Example #5
File: server.py Project: Harshitsriv007/DataQuality
def predict():
    # Error checking
    data_csv = request.files['inputFile']
    if not data_csv:
        return "No file"
    DQ = pd.read_csv(data_csv)
    #print(DQ)
    patientids, data = preprocessing(DQ)
    print(type(data))

    # Convert JSON to numpy array
    # predict_request = [data['FIRSTNAME'], data['LASTNAME'], data['GENDERCODE'], data['DATEOFBIRTH'], data['ETHNICITYCODE'], data['RACECODE'], data['MARITALSTATUS']]
    predict_request = np.array(data)
    #predict_request.astype(float)
    print(predict_request)
    # Predict using the trained XGBoost model
    y = xgb_model.predict(predict_request)
    y = np.array(y).reshape((y.shape[0], 1))
    patientids = np.array(patientids).reshape((patientids.shape[0], 1))
    print(y)
    print(patientids.shape)
    y = pd.DataFrame(y)
    patientids = pd.DataFrame(patientids)
    # Return prediction
    #output = y
    #d = {'col1': PATIENTID , 'col2': PREDICTIONS}
    #result = pd.DataFrame(data=d, index = index)
    results = pd.concat([patientids.reset_index(drop=True), y], axis=1)
    results.columns = ['Patient_id', 'Predictions']
    #results = pd.DataFrame.to_json(results)
    #print(result)
    return render_template("score.html", score=results.to_html())
Code Example #6
 def trainingModel(self):
     self.registerWorking.finishThread.emit()
     state = 0
     while True:
         if state == 0:
             # Pre-process
             obj = preprocessing(self.input_datadir, self.output_datadir)
             nrof_images_total, nrof_successfully_aligned = obj.alignProcessing(
             )
             print('Total number of images: %d' % nrof_images_total)
             print('Number of successfully aligned images: %d' %
                   nrof_successfully_aligned)
             state += 1
             # Classifier
         elif state == 1:
             print("Training Start")
             objModel = classifier(
                 mode='TRAIN',
                 datadir=self.datadir,
                 modeldir=self.modeldir,
                 classifierFilename=self.classifier_filename)
             get_file = objModel.main()
             sys.exit("All Done")
             state += 1
         else:
             break
Code Example #7
 def __getitem__(self, item):
     data = preprocessing(
         self.dataset[item][0], 
         self.dataset[item][1], 
         self.dataset[item][2], 
         self.dataset[item][3], 
         self.dataset[item][4], 
         self.dataset[item][5]
     )
     temp = []
     for i in data["targets_class"]:
         temp.append(torch.tensor(i, dtype=torch.long))
         
     # Return the processed data where the lists are converted to `torch.tensor`s
     return {
         'ids': torch.tensor(data["ids"], dtype=torch.long),
         'mask': torch.tensor(data["mask"], dtype=torch.long),
         'token_type_ids': torch.tensor(data["token_type_ids"], dtype=torch.long),
         'targets_start': torch.tensor(data["targets_start"], dtype=torch.long),
         'targets_end': torch.tensor(data["targets_end"], dtype=torch.long),
         'targets_class': torch.tensor(data["targets_class"], dtype=torch.long),
         'orig_tweet': data["orig_text"],
         'orig_selected': data["orig_keyword"],
         'sentiment': data["class"],
         'offsets': torch.tensor(data["offsets"], dtype=torch.long),
         'error_index': torch.tensor(data["error_index"], dtype=torch.long)
     }
Code Example #8
File: random_forest.py Project: lukaszeckert/EMD-py
def main():
    for column in ["summary", "reviewText"]:
        df_train: pd.DataFrame = load_train_data()
        df_eval: pd.DataFrame = load_eval_data()

        df_train = df_train[:50000]
        df_train = preprocessing(df_train)
        df_eval = preprocessing(df_eval)
        X_train = df_train
        y_train = df_train["score"]

        X_eval = df_eval
        y_eval = df_eval["score"]

        model = Pipeline(
            [
                ("tfidf", ColumnTransformer([
                    ("tfidf",
                     TfidfVectorizer(min_df=10, max_df=0.3, preprocessor=preprocess,
                                     tokenizer=tokenize, ngram_range=(1, 2),
                                     stop_words='english'),
                     column)

                ])),
                ("SVD", TruncatedSVD(n_components=500)),
                ("forest", RandomForestClassifier(n_estimators=10, random_state=0, max_depth=4))
            ]
        )
        param_grid = {
            'SVD__n_components': [500],
            'forest__n_estimators': [100, 500, 1000],
            'forest__max_depth': [4,8]
        }

       # print(model.fit_transform(X_train[:100], y_train[:100]))
        tscv = TimeSeriesSplit(n_splits=2)
        search = GridSearchCV(model, param_grid, n_jobs=12, cv=tscv.split(X_train), verbose=True)
        search.fit(X_train, y_train)
        print("Best parameter (CV score=%0.3f):" % search.best_score_)
        print(search.best_params_)
        model = search.best_estimator_
        print(model[1].explained_variance_ratio_)


        print(f"Tfidf random forest column {column} eval acc:", np.mean(model.predict(X_eval) == y_eval))
        print(f"Tfidf random forest  column {column} train acc:", np.mean(model.predict(X_train) == y_train))

        with open(f"models/random_forest_{column}.pickle", "wb") as file:
            pickle.dump(model, file)
Code Example #9
    def preprocessing(self, ponct=0, spell=0, predict=0, stop=0, lem=0):
        tokens = []
        for i in range(len(self.dyads_index)):
            index = self.dyads_index[i]
            tokens = tokens + preprocessing(self.utterances[index],
                                            self.labels[index], ponct, spell,
                                            predict, stop, lem)

        self.tokens = tokens
Code Example #10
class CombineClassifiers():

	def __init__(self, classifiers, nsamples=24, global_stats=False):
		"""
		We initialize with a list of tuples where the first element is a classifier and the second is the name of the classifier
		Whereas nsamples is a constant or a list. If a list, each element corresponds to the number of samples for the corresponding classifier

		For a 
		"""

		self.classifiers = {}
		self.nsamples = {}
		self.nclassif = 0
		self.global_stats = global_stats

		for classifier, name in classifiers:
			self.classifiers[name] = classifier
			if type(nsamples)==int:
				self.nsamples[name] = nsamples
			elif len(nsamples)==len(classifiers):
				self.nsamples[name] = nsamples[self.nclassif]
			else:
				raise ValueError('nsamples has to be either an int or a list of the same length as classifiers')

			self.nclassif += 1
		
		
	def fit(self, X, y):
		"""
		provide X and y in the form of X, y = load_train() here
		fit all the classifiers individually here
		"""
		for k in self.classifiers.keys():
			X_train, y_train = preprocessing(X, y, sampling_rate=1440//self.nsamples[k])

			if 'RNN' in k:
				X_train = X_train.reshape(X_train.shape[0], -1, self.nsamples[k])

			self.classifiers[k].fit(X_train, y_train)


	def predict(self, X):
		"""
		predict the classes for X
		"""
		preds = np.zeros((self.nclassif, X.shape[0]))

		i = 0
		for k in self.classifiers.keys():
			# no labels at prediction time (the original passed an undefined `_` here)
			X_test, _ = preprocessing(X, None, sampling_rate=1440//self.nsamples[k])

			if 'RNN' in k:
				X_test = X_test.reshape(X_test.shape[0], -1, self.nsamples[k])

			preds[i] = self.classifiers[k].predict_proba(X_test)
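
A hypothetical usage sketch for CombineClassifiers, pieced together from the docstrings above (the concrete classifiers and sample counts are illustrative only): the constructor takes (classifier, name) tuples plus either a single nsamples value or one value per classifier, and fit() expects X, y as returned by the project's load_train(). The sampling_rate=1440//nsamples expression suggests each raw series has 1440 steps.

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

# Two base models, each trained on a differently downsampled version of the data.
combined = CombineClassifiers(
    classifiers=[(LogisticRegression(max_iter=1000), "LogReg"),
                 (RandomForestClassifier(n_estimators=100), "Forest")],
    nsamples=[24, 48],
)
# X, y = load_train()   # as described in the fit() docstring
# combined.fit(X, y)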
Code Example #11
File: test.py Project: hcpmiyuki/hukuzakuna_keisan
def test(line):
    line = preprocessing(line)
    tokens = tokenize(line)
    preferentially_evaluated_token = prioritizeParentheses(tokens)
    actualAnswer = evaluateAll(preferentially_evaluated_token)
    expectedAnswer = eval(line)
    if abs(actualAnswer - expectedAnswer) < 1e-8:
        print("PASS! (%s = %f)" % (line, expectedAnswer))
    else:
        print("FAIL! (%s should be %f but was %f)" % (line, expectedAnswer, actualAnswer))
Code Example #12
File: app_inverted_index.py Project: zu-ann/IR
def search(query, inverted_index, data, document_length, n_results=5):

    query = preprocessing(query, del_stopwords=False)

    search_result = get_search_result(query, inverted_index, data['corpus'],
                                      document_length, n_results)

    results = [(data.loc[index, 'url'], data.loc[index, 'corpus'])
               for index in search_result]
    return results
Code Example #13
File: predictSign.py Project: krishnaarjun/ropesi
 def __init__(self, size, makeData, noComp):
     self.pca = eigenHands(size)
     self.gabor = gaborFilters(False, size)
     self.classify = classifyHands(False, size)
     self.prep = preprocessing(size, noComp)
     if (makeData == True):
         self.pca.makeMatrix("garb")
         self.pca.makeMatrix("hands")
         self.pca.makeMatrix("rock")
         self.pca.makeMatrix("paper")
         self.pca.makeMatrix("scissors")
Code Example #14
File: predictSign.py Project: krishnaarjun/ropesi
	def __init__(self, size, makeData, noComp):
		self.pca      = eigenHands(size)
		self.gabor    = gaborFilters(False, size)
		self.classify = classifyHands(False, size)
		self.prep     = preprocessing(size, noComp)
		if(makeData == True):
	    		self.pca.makeMatrix("garb")
	    		self.pca.makeMatrix("hands")
	    		self.pca.makeMatrix("rock")
	    		self.pca.makeMatrix("paper")
	    		self.pca.makeMatrix("scissors")
Code Example #15
	def fit(self, X, y):
		"""
		provide X and y in the form of X, y = load_train() here
		fit all the classifiers individually here
		"""
		for k in self.classifiers.keys():
			X_train, y_train = preprocessing(X, y, sampling_rate=1440//self.nsamples[k])

			if 'RNN' in k:
				X_train = X_train.reshape(X_train.shape[0], -1, self.nsamples[k])

			self.classifiers[k].fit(X_train, y_train)
Code Example #16
File: main.py Project: hcpmiyuki/hukuzakuna_keisan
def top():
    # Convert full-width characters to half-width and strip whitespace
    line = preprocessing(request.args.get('line', ''))

    # Reject invalid input (consecutive symbols, characters other than digits and symbols, etc.)
    if validator(line) == True:
        tokens = tokenize(line)
        preferentially_evaluated_token = prioritizeParentheses(tokens)
        answer = evaluateAll(preferentially_evaluated_token)
    else:
        answer = validator(line)
    return render_template('top.html', ans=answer)
Code Example #17
def main():
    df_train: pd.DataFrame = load_train_data()
    df_eval: pd.DataFrame = load_eval_data()

    df_train = preprocessing(df_train)
    df_eval = preprocessing(df_eval)

    X_train = [0] * len(df_train)  # placeholder features; DummyClassifier ignores X
    y_train = df_train["score"]

    X_eval = [0] * len(df_eval)
    y_eval = df_eval["score"]

    model = DummyClassifier(strategy="most_frequent")
    model.fit(X_train, y_train)

    print("Baseline eval acc:", np.mean(model.predict(X_eval) == y_eval))
    print("Baseline train acc:", np.mean(model.predict(X_train) == y_train))

    with open("models/baseline.pickle", "wb") as file:
        pickle.dump(model, file)
Code Example #18
def search(query, search_method, inverted_index, data, 
           document_length, w2v_model, w2v_base,
           d2v_model, d2v_base, n_results=5):

    query = preprocessing(query, del_stopwords=False)
    
    if search_method == ['inverted_index', 'word2vec', 'doc2vec']:
        res_inv_ind = get_search_result(query, inverted_index, data['corpus'], document_length, n_results * 50,
                                        return_sim=True)
        res_w2v = search_w2v(query, w2v_model, w2v_base, n_results * 50, return_sim=True)
        res_d2v = search_d2v(query, d2v_model, d2v_base, n_results * 50, return_sim=True)
        combination = res_inv_ind + res_w2v + res_d2v
        
        search_result = [index for index, _ in sorted(combination, key=lambda x: x[1], reverse=True)[:n_results]]
    
    elif search_method == ['inverted_index', 'word2vec']:
        res_inv_ind = get_search_result(query, inverted_index, data['corpus'], document_length, n_results * 50,
                                        return_sim=True)
        res_w2v = search_w2v(query, w2v_model, w2v_base, n_results * 50, return_sim=True)
        combination = res_inv_ind + res_w2v
        
        search_result = [index for index, _ in sorted(combination, key=lambda x: x[1], reverse=True)[:n_results]]
        
    elif search_method == ['inverted_index', 'doc2vec']:
        res_inv_ind = get_search_result(query, inverted_index, data['corpus'], document_length, n_results * 50,
                                        return_sim=True)
        res_d2v = search_d2v(query, d2v_model, d2v_base, n_results * 50, return_sim=True)
        combination = res_inv_ind + res_d2v
        
        search_result = [index for index, _ in sorted(combination, key=lambda x: x[1], reverse=True)[:n_results]]
        
    elif search_method == ['word2vec', 'doc2vec']:
        res_w2v = search_w2v(query, w2v_model, w2v_base, n_results * 50, return_sim=True)
        res_d2v = search_d2v(query, d2v_model, d2v_base, n_results * 50, return_sim=True)
        combination = res_w2v + res_d2v
        
        search_result = [index for index, _ in sorted(combination, key=lambda x: x[1], reverse=True)[:n_results]]

    elif search_method == ['inverted_index']:
        search_result = get_search_result(query, inverted_index, data['corpus'], document_length, n_results)
    
    elif search_method == ['word2vec']:
        search_result = search_w2v(query, w2v_model, w2v_base, n_results)
    
    elif search_method == ['doc2vec']:
        search_result = search_d2v(query, d2v_model, d2v_base, n_results)
    
    else:
        raise TypeError('unsupported search method')
    
    results = [(data.loc[index, 'url'], data.loc[index, 'corpus']) for index in search_result]
    return results
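
A hypothetical call sketch for the dispatcher above (every argument object is a placeholder for index structures and models built elsewhere in the project): search_method is a list naming the engines whose rankings should be combined.

# Combine the inverted index with word2vec ranking and return the top 5 hits.
results = search("query text", ['inverted_index', 'word2vec'],
                 inverted_index, data, document_length,
                 w2v_model, w2v_base, d2v_model, d2v_base, n_results=5)
for url, text in results:
    print(url)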
Code Example #19
File: main.py Project: Mahboub99/Orchestra
def process_image(inputfolder, fn, f):
    cutted, ref_lines, lines_spacing = preprocessing(inputfolder, fn, f)

    last_acc = ''
    last_num = ''
    height_before = 0

    if len(cutted) > 1:
        f.write('{\n')

    for it in range(len(cutted)):
        f.write('[')
        is_started = False

        symbols_boundaries = segmentation(height_before, cutted[it])
        symbols_boundaries.sort(key=lambda x: (x[0], x[1]))

        for boundary in symbols_boundaries:
            label, cutted_boundaries = get_label_cutted_boundaries(
                boundary, height_before, cutted[it])

            if label == 'clef':
                is_started = True

            for cutted_boundary in cutted_boundaries:
                _, y1, _, y2 = cutted_boundary
                if is_started == True and label != 'barline' and label != 'clef':
                    text = text_operation(label, ref_lines[it],
                                          lines_spacing[it], y1, y2)

                    if (label == 't_2' or label == 't_4') and last_num == '':
                        last_num = text
                    elif label in accidentals:
                        last_acc = text
                    else:
                        if last_acc != '':
                            text = text[0] + last_acc + text[1:]
                            last_acc = ''

                        if last_num != '':
                            text = f'\\meter<"{text}/{last_num}">'
                            last_num = ''

                        not_dot = label != 'dot'
                        f.write(not_dot * ' ' + text)

        height_before += cutted[it].shape[0]
        f.write(' ]\n')

    if len(cutted) > 1:
        f.write('}')
Code Example #20
def search_w2v(query, model, w2v_base, n_results):
    query_vec = get_w2v_vectors(model, preprocessing(query))

    similarities = {}

    for doc in w2v_base:
        sim = similarity(query_vec, doc['vec'])
        # print(query_vec)
        similarities[sim] = doc['index']

    results = [
        re.split('/Friends - season [0-9]/Friends - ',
                 similarities[sim].strip('.ru.txt'))[1]
        for sim in sorted(similarities, reverse=True)[:n_results]
    ]

    return results
Code Example #21
	def test(self):
		error = 0
		testCount = 0
		for i in xrange(self.__classes):
			for j in xrange(50, 50 + self.__train_samples):
				_file = self.__file_path + str(i) + "/" + str(i) + str(j) + ".pbm"
				src_image = imread(_file, 0)				

				#process file
				prs_image = preprocessing(src_image, self.__size, self.__size)
				prs_np = prs_image
				r = self.classify(prs_np, 0)				
				if int(r) != i:
					error += 1
				
				testCount += 1

		totalerror = 100 * error / float(testCount)
		print "System Error: " + str(totalerror)
Code Example #22
def prediction(x, clf, multilabel_binarizer, vec):

    processed_text = preprocessing(x)

    data = vec.transform([processed_text])

    ops = clf.predict(data)

    labels = multilabel_binarizer.inverse_transform(ops)

    ops_prob = clf.predict_proba(data) * ops

    labels_prob = multilabel_binarizer.classes_[ops_prob[0] > 0]

    ops_list = ops_prob[ops_prob != 0]

    cat = decode_cat(labels)
    '''
    
    return data -> {categories:[...],root_cause:[.....],proba:[......]}

    '''

    if len(cat[0]) > 0:

        categories = ""
        for i in cat[0][:]:
            categories += i + ","

        ops_list *= 100
        data = {
            'cat': [categories],
            'root_causes': labels,
            'proba': ops_list.tolist()
        }

        return data

        # for i,j in enumerate(labels[0]):
        #     st.write("### "+j.strip() + 'proba'ops_list[i] * 100)
    else:
        return "no root cause deteced please enter valid input"
Code Example #23
 def __init__(self, data_path = './data_bci', train = True, one_khz = False, filter = False, robust_scaler = False, 
             num_samples = 20, shift = 10, force_cpu = False):
     # Load data
     self.input, self.target =  dlc_bci.load(root = data_path, one_khz = one_khz, train = train)
     self.train = train
     self.force_cpu = force_cpu
     
     print('Input data loaded (size = {})'.format(self.input.shape))
     print('Target data loaded (size = {})'.format(self.target.shape))
     
     #Filtering
     if filter:
         if one_khz:
             fs = 1000
         else:
             fs = 100
         self.input = preprocessing(self.input, ignore_outliers = robust_scaler, fs = fs)
         
         if torch.cuda.is_available() and not force_cpu:
             self.input = self.input.cuda()
Code Example #24
File: test.py Project: zoomina/MZ_CEC
def test_main(checkpoint,
              data,
              batch_size,
              num_workers,
              num_classes,
              inner_emotion=-1,
              test=False):
    max_len = 64
    model = BERTClassifier(num_classes=num_classes).build_model()
    model.load_state_dict(checkpoint)
    device = torch.device(
        "cuda:0") if torch.cuda.is_available() else torch.device("cpu")
    eval_dtls = preprocessing(json2csv(data, test=test),
                              inner_emotion=inner_emotion,
                              test=test)

    data_test = test_loader(eval_dtls, max_len, batch_size, num_workers)

    result_df = test(data_test, model, device)  # NOTE: the boolean parameter 'test' shadows the test() function called here

    return result_df
Code Example #25
def digit_process():
    if (request.method == "POST"):
        img = request.get_json()
        img = preprocessing(img)

        save_path = 'model/params.pkl'
        params, cost = pickle.load(open(save_path, 'rb'))

        [f1, f2, w3, w4, b1, b2, b3, b4] = params
        digit, probability = predict(img, params)
        #print(digit, "%0.2f"%probability)

        #l = int(digit)
        #p = float(probability)

        data = {
            "digit": int(digit),
            "probability": float(np.round(probability, 3))
        }
        data_json = json.dumps(data)

        return jsonify(data_json)
Code Example #26
File: w2v_d2v_functions.py Project: zu-ann/IR
def save_w2v_base(files_list, model, mystem, save=True, title='w2v_base'):
    """Индексирует всю базу для поиска через word2vec"""
    documents_info = []

    for i, file in tqdm_notebook(enumerate(files_list)):
        if file.endswith('.txt'):
            with open(file, 'r', encoding='utf-8') as f:
                text = f.read()
        else:
            text = file
            file = i

        lemmas = preprocessing(text)
        vec = get_w2v_vectors(model, lemmas)

        file_info = {'file': file, 'word2vec': vec}
        documents_info.append(file_info)

    if save:
        with open(title + '.pkl', 'wb') as fw:
            pickle.dump(documents_info, fw)

    return documents_info
Code Example #27
def search_inv_index(query, inverted_index, term_doc_matrix, files_length,
                     n_results) -> list:
    """
    Compute sim score between search query and all documents in collection
    :param query: input text
    :return: list of doc_ids
    """

    relevance_dict = defaultdict(float)
    lemmas = preprocessing(query)

    for lemma in lemmas:
        sims = compute_sim(lemma, inverted_index, term_doc_matrix,
                           files_length)
        for doc in sims:
            relevance_dict[doc] += sims[doc]

    result = sorted(relevance_dict, key=relevance_dict.get,
                    reverse=True)[:n_results]

    return [
        re.split('/Friends - season [0-9]/Friends - ',
                 files_list[doc].strip('.ru.txt'))[1] for doc in result
    ]
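
A hypothetical call sketch based on the docstring (the index structures are placeholders built elsewhere in the project); the function returns episode names extracted from the matching file paths.

# Rank all documents against the query and keep the five most relevant episodes.
top_episodes = search_inv_index("query text", inverted_index,
                                term_doc_matrix, files_length, n_results=5)
print(top_episodes)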
Code Example #28
	def classify(self, image_source, show_result):		
		nearest = CreateMat(1, self.__K,CV_32FC1)
		#process file	

		prs_image = preprocessing(image_source, self.__size, self.__size)
		
		#Set data 
		img32 = CreateImage((self.__size, self.__size), IPL_DEPTH_32F, 1)
		ConvertScale(fromarray(prs_image), img32, scale=(1.0/255))		
		data = GetSubRect(img32, (0, 0, self.__size, self.__size))
		row1 = Reshape(data, 0, 1)
		row1np = np.array(row1)
		retval, result, nearest, dists = self.__knn.find_nearest(row1np, self.__K)		
		
		accuracy = 0
		for i in xrange(self.__K):
			if nearest[0][i] == result[0][0]:
				accuracy += 1
		pre = 100 * accuracy / float(self.__K)
		if show_result == 1:
			print "|\t" + str(result[0][0]) + "\t| \t" + str(pre) + "  \t| \t" + str(accuracy) + " of " + str(self.__K) + " \t|"
			print " ---------------------------------------------------------------"

		return result
Code Example #29
	tweet_set[len(tweet_set) - 1].tweetId = tweet['_id']
	dic_user[userId].addTweet(tweet_set[len(tweet_set) - 1])

	c += 1
	if c == num_tweets:
		break

#for userId, user in dic_user.iteritems():
#	print(str(userId) + " " + str(len(user.tweet_set)))

k_topics = num_topics
LDA_iterations = num_iterations
sentimentPoints = getSentimentPoints()
#print(sentimentPoints)

dictionary, corpus, out_set = preprocessing(doc_set)

for i in range(0,len(out_set)):
	tweet_set[i].wordSet = out_set[i]

sentimentsOfTweets = getSentimentScoreOfTweets(out_set)
model = LDA(dictionary, corpus, k_topics, LDA_iterations)

for i in range(0,len(sentimentsOfTweets)):
	tweet_set[i].russell_tuple = sentimentsOfTweets[i]

sentDic = loadDict()

dictByTopic = []
tempDic = {}
topics = model.get_topics()
Code Example #30
# import important packages
from imports import *
from logs import log
from preprocessing import *

# set a logger file
logger = log(path="logs/", file="cross_val.logs")

#load dataset
data = pd.read_csv("data/loans_data.csv")

# preprocessing the loan data
data = preprocessing(data)

# split data into train and test
X = data.drop('Loan_Status', axis=1)
y = data.Loan_Status

# create a dictionary of classifiers
models = {
    "KNN": KNeighborsClassifier(),
    "RF": RandomForestClassifier(),
    "GB": GradientBoostingClassifier(),
    "DTC": DecisionTreeClassifier(),
    "BC": BaggingClassifier(),
    "XGB": XGBClassifier(),
    "EXT": ExtraTreesClassifier(),
    "LG": LogisticRegression(),
    "BBC": BalancedBaggingClassifier(),
    "EEC": EasyEnsembleClassifier(),
}
Code Example #31
def register(m1, m2, m3=""):
    if m3 != "":
        if m1 not in template_constraints:
            template_constraints[m1] = set()
        template_constraints[m1].add(m3)
    else:
        if m1 not in template_constraints:
            template_constraints[m1] = set()
        template_constraints[m1].add(m2)


filename=input("Enter file name\n")
input_expressions = preprocessing(filename)


datatype_pattern=r'int | double' 
for datatype in temp_datatype_table:
        datatype_pattern+=' | '+datatype
        
tokens = ('OB','CB','ID','DT','INT','DBL','COMMA','SC','EQ','COUT','CIN','RETURN',
'EXT','INS','ASSN_OP','OR','AND','S_AND','EQUAL','N_EQUAL','REL_OP',
'INCR','DECR','SO','MUL','DIV','PLUS','MINUS','NOT','MOD','OC','CONST')

@lex.TOKEN(datatype_pattern)
def t_DT(t):
        return t
def t_COMMA(t):
        r','
Code Example #32
                yhat_e,
                label='(' + str(rmse_e.round(decimals=2)) +
                ') Bayesian Linear Regression (r/b)')

    pyplot.xlabel('Sample Index')
    pyplot.ylabel('Values')
    pyplot.title('Comparison of Linear Regression Model answer to 1f')
    pyplot.legend(loc="best")


if __name__ == '__main__':
    # partition into an 80% training set and a 20% test set; total parameters = 24
    datasets = pd.read_csv('train.csv')
    # Set seed so we get same random allocation on each run of code
    #np.random.seed(7)
    preprocessed = preprocessing(datasets)
    train_set_x, train_set_y, test_set_x, test_set_y = train_test_split_preprocessed(
        preprocessed)

    #solution1: Normal Linear Regression (1b)
    rmse_b, yhat_b = one_b.linear_regression_1b(train_set_x, train_set_y,
                                                test_set_x, test_set_y)

    #solution2: Regularized Linear Regression (1c)
    rmse_c, yhat_c = one_c.Regularized_LinearRegression_1c(
        train_set_x, train_set_y, test_set_x, test_set_y)

    #solution3: Regularized Linear Regression with bias term (1d)
    rmse_d, yhat_d = one_d.Regularized_Biased_LinearRegression_1d(
        train_set_x, train_set_y, test_set_x, test_set_y)
Code Example #33
File: LDA.py Project: bramrodenburg/nlp2015
        dir_path = sys.argv[2]

    #inFile = dir_path + "dvd.xml"
    #inFile = dir_path + "dvdReviews.xml"
    # inFile = dir_path + "example.xml"
    inFile = dir_path + "all.review"
    pickelfile = dir_path + "dvd_reviews_limited.pkl"
    h5_file = dir_path + "data.h5"
    # pickelfile = dir_path + "example.pkl"
    # pickelfile = dir_path + "dvd_reviews.pkl"
    mem_file_results = dir_path + "lda_results.h5"

    # inFile = sys.argv[2] + "dvd.xml" huge file

    if preprocess == 'True':
        reviews, w, doc_words = preprocessing(inFile)
        print "Save objects to file %s" % pickelfile
        start = timer()
        with open(pickelfile, 'wb') as f:
            pickle.dump(reviews, f)
            pickle.dump(w, f)
            print "Number of reviews : %d" % len(reviews)
            print "# of words in bag %s %s" % doc_words.shape
            #pickle.dump(doc_words, f)
        end = timer()
        h5f = h5py.File(h5_file, 'w')
        h5f.create_dataset('doc_words', data=doc_words)
        h5f.close()
        print "Saved objects to file in %s seconds." % (end - start)
    else:
        with open(pickelfile, 'rb') as f:
Code Example #34
File: MGLDA.py Project: wusix2011/nlp2015
    if len(sys.argv) == 1:
        preprocess = "True"
        # dir_path = 'F:/temp/topics/D - data/movie/test/'
        product = "electronics"
        dir_path = 'S:/Workspace/data/sports/'
    else:
        preprocess = sys.argv[1]
        dir_path = sys.argv[2]

    mem_file_results = dir_path + "mglda_" + product + "_" + str(
        N_GIBBS_SAMPLING_ITERATIONS) + ".mem"

    if preprocess == 'True':
        inFile = dir_path + "all.review.xml"
        reviews, d_vocab, l_bag_of_words, m_doc_words, m_docs_sentence_words = preprocessing(
            inFile)
    else:
        reviews, d_vocab, l_bag_of_words, m_doc_words, m_docs_sentence_words = load_objects(
            dir_path, product)

    # check_doc_word_matrix(doc_words, reviews, w)
    # last parameter is the max number of sentences for corpus
    doc_sentence_count, max_number_s = count_sent_docs(reviews)
    # create LDAModel object and initialize counters for Gibbs sampling
    lda = LDAModel(l_bag_of_words, m_docs_sentence_words, doc_sentence_count,
                   max_number_s, K_GL, K_LOC, 0.005, 0.005, 0.005, 0.005,
                   0.005, 0.005, 0.005, dir_path)
    # initialize counters
    start = timer()
    print "LDA initialize..."
    lda.initialize()
Code Example #35
File: MGLDA.py Project: bramrodenburg/nlp2015
    global reviews

    if len(sys.argv) == 1:
        preprocess = "True"
        # dir_path = 'F:/temp/topics/D - data/movie/test/'
        product = "electronics"
        dir_path = 'S:/Workspace/data/sports/'
    else:
        preprocess = sys.argv[1]
        dir_path = sys.argv[2]

    mem_file_results = dir_path + "mglda_" + product + "_" + str(N_GIBBS_SAMPLING_ITERATIONS) + ".mem"

    if preprocess == 'True':
        inFile = dir_path +  "all.review.xml"
        reviews, d_vocab, l_bag_of_words, m_doc_words, m_docs_sentence_words = preprocessing(inFile)
    else:
        reviews, d_vocab, l_bag_of_words, m_doc_words, m_docs_sentence_words = load_objects(dir_path, product)

    # check_doc_word_matrix(doc_words, reviews, w)
    # last parameter is the max number of sentences for corpus
    doc_sentence_count, max_number_s = count_sent_docs(reviews)
    # create LDAModel object and initialize counters for Gibbs sampling
    lda = LDAModel(l_bag_of_words, m_docs_sentence_words, doc_sentence_count, max_number_s, K_GL, K_LOC, 0.005, 0.005, 0.005, 0.005,
                   0.005, 0.005, 0.005, dir_path)
    # initialize counters
    start = timer()
    print "LDA initialize..."
    lda.initialize()
    # lda.print_counts()
    end = timer()
Code Example #37
File: startMenu.py Project: krishnaarjun/ropesi
		projData = hands.justGetDataMat(datas[dataset][i],"",True)
		hands.projPCA(projData, False, "PCA/", datas[dataset][i])
#____________________________________________________________________________________________________

elif(int(choice) == 2):
	dataset = raw_input('choose the dataset (r/p/s) ...')
	datas   = {'r':'rock', 'p':'paper', 's':'scissors'} 
	gabor   = gaborFilters(buildOpt[str(build)],int(sizeImg))
	gabor.setParameters(0.4, 0.8, 20, (numpy.pi*3.0/4.0), 5.0, 4.0)
	data    = cv.Load("data_train/"+datas[dataset]+"Train"+str(sizeImg)+".dat")
	gabor.convolveImg(data, True)
#____________________________________________________________________________________________________

elif(int(choice) == 3):
	aNumber = raw_input('write an unused nr/word ...')  
	prep    = preprocessing(int(sizeImg),0,0)
	prep.getHandsVideo(aNumber)
#____________________________________________________________________________________________________

elif(int(choice) == 4):
	noComp = raw_input('number of components for PCA no ...')
	dataset = raw_input('choose the dataset c= > rock & paper & scissors; h => hands vs garbage ...')   
	datas   = {'c':['rock','paper','scissors'], 'h':['hands','garb']} 
	hands   = eigenHands(int(sizeImg))
	_,data,txtLabels = hands.justGetDataMat(datas[dataset][0],"",False)
	prep    = preprocessing(int(sizeImg),int(noComp))
	prep.doManyGabors(data,txtLabels,dataset, False)
#____________________________________________________________________________________________________

elif(int(choice) == 5):
	noComp  = raw_input('number of components for PCA ...')
Code Example #38
    print(len(example))

    df = pd.DataFrame({'example': example})
    df['example'] = [word_tokenize(entry) for entry in df['example']]
    stopWords = set(stopwords.words('french'))
    stopWords_ang = set(stopwords.words('english'))
    l = ["-", "d", "co", "si"]
    modification(df, "example", stopWords_ang)
    modification(df, "example", stopWords)
    modification(df, "example", l)
    # French stemming

    stem_list = stem(df, "example")

    preprocessing(df, 'example', stem_list)
    example = df.values.tolist()

    flat_list = lambda l: [item for sublist in l for item in sublist]
    example = [i.lstrip() for i in df['example']]

    #Model
    dtm_lsa = LSA(example)

    df = pd.read_excel(path + "\input_file.xlsx")
    #example_des_b=df.reset_index()['Description'].values.tolist()

    # Label the raw data
    df[col2] = df[col2][(~df[col2].duplicated()) | df[col2].isna()]
    #df to list
    example_des_b = df.reset_index()[col1].values.tolist()