Example #1
def TestMultinomialNB(class_name_list, test_file_path, vocabulary_dict,
                      prior_dict, cp_dict):
    positive = 0  # documents classified into their correct class
    negative = 0  # documents misclassified
    for class_name in class_name_list:
        pos = 0
        neg = 0
        class_index = class_name_list.index(class_name)
        folder_path = join(test_file_path, class_name)
        doc_list = PreProcessing.getFileNames(folder_path)
        for doc in doc_list:
            filepath = join(folder_path, doc)
            filtered_word_list = PreProcessing.readData(filepath)
            index = applyMultinomialNB(vocabulary_dict, prior_dict, cp_dict,
                                       filtered_word_list)

            if index == class_index:
                pos += 1
            else:
                neg += 1

        print(
            "Class: {0}, Total docs:{1} => Positive:{2}, Negative:{3}".format(
                class_name, len(doc_list), pos, neg))
        positive += pos
        negative += neg

    return (positive, negative)
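The returned pair counts correctly and incorrectly classified documents, so overall accuracy follows directly. A hedged usage sketch (the argument values are placeholders built by the training code elsewhere in this project):

correct, wrong = TestMultinomialNB(class_name_list, test_file_path,
                                   vocabulary_dict, prior_dict, cp_dict)
accuracy = correct / (correct + wrong)      # fraction of test documents classified correctly
print("Accuracy: {:.2%}".format(accuracy))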
Example #2
def GenerateVocabularyData(stemming, vocabulary_path, train_data_path):
    with open(train_data_path) as tdf:
        train_data = json.load(tdf)

    vocabulary_dict = {}
    unigram_set = set()
    bigram_set = set()
    index = 0

    TotalPositiveWordCount = 0
    TotalNegativeWordCount = 0

    for key in train_data:
        for value in train_data.get(key):
            pp_value = pp.StopWordAndSpecialCharRemoval(value, stemming)
            words = re.split(r'\s+', pp_value)
            if stemming:
                # assumes pp.perform_stemming stems each token independently;
                # the original stemmed pp_value character by character here
                stemmed = list(pp.perform_stemming(words))
                bigrams = list(zip(stemmed[:-1], stemmed[1:]))
                unigrams = stemmed
            else:
                bigrams = list(zip(words[:-1], words[1:]))
                unigrams = words

            temp_bigram_set = set(bigrams)
            bigram_set |= temp_bigram_set  # accumulate bigrams across documents
            unigram_set.update(unigrams)
            if key == 'positive':
                TotalPositiveWordCount = len(set(unigrams)) + len(
                    temp_bigram_set) + TotalPositiveWordCount
            else:
                TotalNegativeWordCount = len(set(unigrams)) + len(
                    temp_bigram_set) + TotalNegativeWordCount
    vocabulary_dict.update({'unigram': list(unigram_set)})
    vocabulary_dict.update({'bigram': list(bigram_set)})
    if os.path.exists(vocabulary_path):
        os.remove(vocabulary_path)
    with open(vocabulary_path, 'w') as outfile:
        json.dump(vocabulary_dict, outfile, sort_keys=True, indent=4)
    return {
        'PositiveWordCount': TotalPositiveWordCount,
        'NegativeWordCount': TotalNegativeWordCount
    }
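A hedged call sketch (file names are placeholders): the function writes {'unigram': [...], 'bigram': [...]} to vocabulary_path and returns the per-class word counts.

counts = GenerateVocabularyData(stemming=True,
                                vocabulary_path='vocabulary.json',    # placeholder output file
                                train_data_path='train_data.json')    # placeholder training JSON
print(counts['PositiveWordCount'], counts['NegativeWordCount'])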
Example #3
def build_handler():

    try:
        #setNumOfBins()
        global numOfIntervals
        toCheck = e2.get()
        if toCheck == "":
            showinfo(
                "Naive Bayes Classifier",
                "Please insert an integer for the Discretization bins attribute"
            )
            return
        numOfIntervals = int(toCheck)
    except ValueError:  # raised by int() when the entry is not an integer
        showinfo("Naive Bayes Classifier",
                 "Discretization bins must be an integer")
        return
    if numOfIntervals < 2:
        showinfo("Naive Bayes Classifier",
                 "Discretization bins must be at least 2")
        return

    if os.stat(pathToStructure).st_size == 0:
        showinfo("Naive Bayes Classifier",
                 "The file Structure.txt is empty. Please load valid files")
        return
    structure_file = open(pathToStructure, "r")
    try:
        dfTrain = pd.read_csv(pathToTrain)
    except Exception as e:
        if e.__str__() == "No columns to parse from file":
            showinfo("Naive Bayes Classifier",
                     "The file train.csv is empty. Please load valid files")
        else:
            showinfo("Naive Bayes Classifier",
                     "The file train.csv has errors. Please load valid files")
        return  # without this, dfTrain would be undefined below
    totalNumOfRecords_train = dfTrain.shape[0]  # num of records
    if numOfIntervals > totalNumOfRecords_train:
        showinfo(
            "Naive Bayes Classifier",
            "Discretization bins must not be greater than the number of train set records"
        )
        return
    global dfTrainFinal
    dfTrainFinal = pp.preProcess(structure_file, dfTrain, numOfIntervals)
    structure_file = open(pathToStructure, "r")
    attribute_values_dict = pp.set_attribute_values_dict(structure_file)

    cl.prepareModel(dfTrainFinal, pathToStructure, numOfIntervals,
                    attribute_values_dict)
    classify_Button.config(state='normal')
    showinfo("Naive Bayes Classifier",
             "Building classifier using train-set is done!")
Example #4
def documentFrequency(data, terms):
    df = []  # document frequency of each term (number of documents containing it)
    documents = [item.document for item in data]

    for term in terms:
        dfWeight = 0
        for document in documents:
            document = pre.split(pre.stemming(pre.filtering(pre.tokenization(document))))
            if term in document:
                dfWeight += 1
        df.append(dfWeight)
    return df
Example #5
def rawTermWeighting(data, terms):
    rawWeight = []  # per-document raw term frequencies
    documents = [item.document for item in data]

    for document in documents:
        documentWeight = []
        document = pre.split(pre.stemming(pre.filtering(pre.tokenization(document))))

        for term in terms:
            documentWeight.append(document.count(term))

        rawWeight.append(documentWeight)

    return rawWeight
Example #6
def preprocess():
    word="code"
    person="wg_"

    # TODO: set picture to the same size regardless of its length
    # tList, xList, yList, zList = pre.readFileGenByAcc("./hotwords_rsc/"+person+word+".csv")
    tList, xList, yList, zList = pre.readFile("./S9one100five.tsv")
    tList, freqList, xList, yList, zList = pre.standardize(tList, xList, yList, zList, highpass=5 / 1000)
    length = len(tList)
    # fig,ax=plt.subplots(3,1)
    # ax[0].plot(freqList[0:round(length / 2)], xList[0:round(length / 2)], color='red')
    # ax[1].plot(freqList[0:round(length / 2)], yList[0:round(length / 2)], color='green')
    plt.plot(freqList[0:round(length / 2)], np.abs(zList[0:round(length / 2)]), color='blue')

    plt.show()
Example #7
def extract():
    word = "code"
    person = "ty_"

    prefix="F:/2020AccelEve/database/fixed_rate/S9/"
    filename="S9one"
    testFreq=100
    testCase=1
    postfix=".tsv"
    for i in range(4):
        testCase=1
        for j in range(5):
            STR = prefix + filename + str(testFreq) + '_' + str(testCase) + postfix
            # TODO: set picture to the same size regardless of its length
            tList, xList, yList, zList=pre.readFile(STR)
            # tList, xList, yList, zList = pre.readFileGenByAcc("./hotwords_rsc/" + person + word + ".csv")
            # =============time domain==================
            t=[]
            t.append(getMean(zList))
            t.append(getStdDev(zList))
            t.append(getKurtosis(zList))
            t.append(getSkewness(zList))
            t.append(getAveDev(zList))
            t.append(getRMS(zList))
            t=np.array(t)
            # =================freq domain==================
            f=[]
            tList, freqList, xList, yList, zList = pre.standardize(tList, xList, yList, zList, highpass=10 / 1000)
            length=len(zList)
            zList=zList[0:round(length/2)]
            freqList=freqList[0:round(length/2)]
            f.append(getSpecStdDev(np.abs(zList),freqList))
            f.append(getSpecCentroid(np.abs(zList),freqList))
            f.append(getSpecSkewness(np.abs(zList),freqList))
            f.append(getSpecKurt(np.abs(zList),freqList))
            f.append(getSpecCrest(np.abs(zList),freqList))
            f.append("Nothing")
            f=np.array(f)

            dic={'FreqDomain':f,'TimeDomain':t}
            DF=pd.DataFrame(data=dic)

            savedName = prefix + "FeatureVector/" + filename + str(testFreq) + '_' + str(testCase) + '.csv'
            DF.to_csv(path_or_buf=savedName)
            # DF.to_excel(excel_writer='./test.xlsx')
            # print(DF)
            testCase += 1
        testFreq += 100
Example #8
def GetDigits(img):
	img = PreProcessing.Binarization(img)
	cv2.imshow('', img) 
	rows,cols = img.shape[:] 
	cols_coordinates = []
	number=''
	flag = True
	for j in range(0,cols):

		temp_sum = 0
		for i in range(0,rows):
			temp_sum += img[i][j]

		if temp_sum == 255*rows and flag:
			cols_coordinates.append(j+5)
			flag = False
		if temp_sum<255*rows:
			flag = True

	print(cols_coordinates)
	if len(cols_coordinates)>0: 
		x = cols_coordinates[0] 

	for i in range(1,len(cols_coordinates)):
		w = cols_coordinates[i]
		crop_img = img[:,x:w]
		x = w 
		crop_img = FindBoundary(crop_img)
		crop_img = cv2.resize(crop_img, (20,20)) 
		temp_crop_img = []
		for i in range(28):
			temp = []
			for j in range(28):
				temp.append(0)
			temp_crop_img.append(temp)

		for i in range(20):
			for j in range(20):
				if crop_img[i][j]>127:
					crop_img[i][j] = 255
				else:
					crop_img[i][j] = 0

				if crop_img[i][j]==0:
					temp_crop_img[4+i][4+j] = 255  
		cv2.imshow('Boundary', np.float32(temp_crop_img))
		cv2.waitKey(0) 
		pred_digit=get_Prediction(temp_crop_img)
		count=0
		digit=str(pred_digit[0])
		for i in range(20):
			for j in range(20):
				if temp_crop_img[4+i][4+j] == 255:
					count = count + 1
		if count >= 200:
			digit = "1"
		print(digit, count)
		# Append each digit to form the full number
		number = number + digit
	return number	
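A hedged usage sketch (the image path is a placeholder, and the model behind get_Prediction is assumed to be loaded elsewhere):

img = cv2.imread('digits_line.png')   # placeholder: image containing a row of digits
print(GetDigits(img))                 # prints the recognized digit string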
Example #9
def single_model(df, model_type, target_col, cont_feat, cat_feat, refit):
    '''
    Runs a grid search of a single type of model.
    Inputs:
        df: a Pandas dataframe
        model_type (str): the type of model to be run
        target_col (str): the name of the target column
        cont_feat (list): list of continuous features
        cat_feat (list): list of categorical features
        refit (str or False): how the best model should be refit
            For decision tree refit can be one
    Returns:
        best_model: model object of the best model
        dataframe of feature importances
    '''
    train, test = PreProcessing.tt_split(
        df[[target_col] + cont_feat + cat_feat], 30)
    normalize_cont = True
    if model_type == "RandomForest" or model_type == "DecisionTree":
        normalize_cont = False
    train_X, train_Y, test_X, test_Y, labels = pre_processing(
        target_col, train, test, cont_feat, cat_feat, normalize_cont)
    grid = build_model(train_X, train_Y, refit, model_type)
    best_model = eval_model(grid, test_X, test_Y, model_type)
    fixed_val_threshold(best_model, test_X, test_Y)
    feature_headers = list(labels)
    feature_headers.remove(target_col)

    return best_model, pd.DataFrame(
        index=feature_headers,
        data=best_model.feature_importances_).sort_values(by=0,
                                                          ascending=False)
Example #10
def try_four_models(df, target_col, cont_feat, cat_feat, refit):
    """Copied from log_model to call big_grid_search instead of build_log_model"""
    train, test = PreProcessing.tt_split(
        df[[target_col] + cont_feat + cat_feat], 30)
    train_X, train_Y, test_X, test_Y, labels = pre_processing(
        target_col, train, test, cont_feat, cat_feat)
    big_grid_search(train_X, train_Y, test_X, test_Y, refit=refit)
Example #11
 def pre_processing(self):
     if self.input_tests(self.entryClusterNumber.get(),
                         self.entryRunsNumber.get(), self.file_path):
         self.preProcessing = PreProcessing(self.df).clean()
         message_box.showinfo("K Means Clustering",
                              "Preprocessing completed successfully!")
         self.cluster_button.config(state=NORMAL)
Example #12
def cal_tenengrad(img):
    if len(img.shape) == 3:
        (img, _, _) = cv.split(img)
    temp = pp.image_to_matrix(img)
    temp_sobel = filters.sobel(temp)  # Sobel gradient image
    source = np.sum(temp_sobel ** 2)  # sum of squared gradient magnitudes
    metric = np.sqrt(source)          # Tenengrad focus measure
    return metric
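As a quick sanity check of the Tenengrad measure, a sharper image should score higher than a blurred copy. A self-contained sketch (not part of the project; uses scikit-image's sample image and np.asarray in place of pp.image_to_matrix):

import numpy as np
from scipy.ndimage import gaussian_filter
from skimage import data, filters

sharp = data.camera().astype(float)         # built-in 512x512 test image
blurred = gaussian_filter(sharp, sigma=3)   # defocused version

def tenengrad(img):
    g = filters.sobel(np.asarray(img))      # Sobel gradient image
    return np.sqrt(np.sum(g ** 2))          # same metric as cal_tenengrad

print(tenengrad(sharp) > tenengrad(blurred))  # expected: True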
Example #13
def binaryTermWeighting(data, terms):
    binaryWeight = []  # per-document binary (presence/absence) weights
    documents = [item.document for item in data]

    for document in documents:
        documentWeight = []
        document = pre.split(pre.stemming(pre.filtering(pre.tokenization(document))))

        for term in terms:
            if term in document:
                documentWeight.append(1)
            else:
                documentWeight.append(0)

        binaryWeight.append(documentWeight)

    return binaryWeight
Example #14
def main():

    pre_processed_data, pre_processed_data_matrix = PreProcessing.pre_process(
        KGRAM_RANGE, TOKEN_TYPE)
    processed_data = Mining.KGramClusteringExperiment(
        pre_processed_data, pre_processed_data_matrix)

    return
Example #15
def clusters_in_two_dim_no_url():
    data, dataMatrix, features = PreProcessing.pre_process_content_only_no_url(
        (2, 2), file_name='IRAhandle_tweets_all.csv')
    print('PreProcessing Done')
    #Mining.SimpleKGram(data,dataMatrix,4)
    data = Mining.project_to_two_dimensions(data, dataMatrix)
    print('Mining Done')
    PostAnalysis.plot_2D(data, '2dPlotSimpleClusteringCR22.png')
    data.to_csv('simpleClusteringK20WithCords.csv')
Example #16
 def generateWordCount(self, train_data, stemming):
     word_and_word_count = {}
     document = ''
     for td in train_data:
         document = document + ' ' + td
     document = pp.StopWordAndSpecialCharRemoval(document, stemming)
     word_array = numpy.array(document.split())
     unique, counts = numpy.unique(word_array, return_counts=True)
     return dict(zip(unique, counts))
Example #17
def GenSet(times, iteration):
    print("分组:" + str(times))

    coal_prefix = 'D:\\coal-gangue\\selected\\coal\\'
    gangue_prefix = 'D:\\coal-gangue\\selected\\gangue\\'
    coal_num = 184
    gangue_num = 182

    suffix = '.jpg'
    train = []
    label = []
    testset = []
    for i in range(coal_num):
        path = coal_prefix + str(i) + suffix
        img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
        img = pp.prep(img)
        # area-relation (INTER_AREA) resampling is used for interpolation here
        #img = cv2.resize(img, (150, 150), interpolation=cv2.INTER_AREA)
        if i % iteration != times:
            train.append(fe.Rotation_invariant_LBP(img))
            label.append("coal")
        else:
            t0 = time.time()
            tmp = [i, fe.Rotation_invariant_LBP(img), "coal"]
            t1 = time.time()
            tmp.append(t1 - t0)
            testset.append(tmp)
    for i in range(gangue_num):
        path = gangue_prefix + str(i) + suffix
        img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
        img = pp.prep(img)
        # area-relation (INTER_AREA) resampling is used for interpolation here
        #img = cv2.resize(img, (150, 150), interpolation=cv2.INTER_AREA)
        if i % iteration != times:
            train.append(fe.Rotation_invariant_LBP(img))
            label.append("gangue")
        else:
            t0 = time.time()
            tmp = [i, fe.Rotation_invariant_LBP(img), "gangue"]
            t1 = time.time()
            tmp.append(t1 - t0)
            testset.append(tmp)
    print("Group " + str(times) + " finished!")
    return np.asarray(train), np.asarray(label), testset
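GenSet performs a fold-style split: images whose index satisfies i % iteration == times are held out as the test set, the rest become training data. A hedged usage sketch (assumes the hard-coded D:\coal-gangue folders exist):

train, label, testset = GenSet(times=0, iteration=5)   # hold out every 5th image as fold 0
print(train.shape, label.shape, len(testset))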
Example #18
def pipeline(img, isVideo=False):
    # Image Preprocessing
    undst, binary, binary_warped = PreProcessing.preprocess_image(img)

    # Lane Detection Code Start
    lanes, leftx, lefty, rightx, righty, ploty = LaneFinding.get_lane_lines(
        binary_warped, isVideo)

    lcurve, rcurve = Support.get_real_lanes_curvature(ploty, leftx, lefty,
                                                      rightx, righty)

    output = draw_lane_area(undst, binary_warped, ploty, leftx, lefty, rightx,
                            righty, isVideo)

    left_fit, right_fit, dummy = Support.fit_polylines(binary_warped.shape[0],
                                                       leftx,
                                                       lefty,
                                                       rightx,
                                                       righty,
                                                       x_scale_factor=1,
                                                       y_scale_factor=1)

    left_fitx, right_fitx = Support.get_polylines_points(
        ploty, left_fit, right_fit)

    if isVideo:
        lcurve, rcurve = getSmoothedCurveData(lcurve, rcurve)
        left_fitx, right_fitx = getSmoothedLanesData(left_fitx, right_fitx)

    shiftFromLaneCenter_m, side = calculate_shift_from_lane_center(
        binary_warped, left_fitx, right_fitx)

    Font = cv2.FONT_HERSHEY_SIMPLEX
    color = (255, 255, 255)
    cv2.putText(output, 'curve = ' + str((lcurve + rcurve) / 2) + ' m',
                (10, 100), Font, 1, color, 2, cv2.LINE_AA)

    cv2.putText(
        output, 'Vehicle is ' + str(shiftFromLaneCenter_m) + ' (m) ' + side +
        ' of lane center', (10, 150), Font, 1, color, 2, cv2.LINE_AA)
    # Lane Detection Code End

    # Vehicle Detection Code Start
    cars_boxs = get_classified_cars_boxs(undst)
    classified_boxs = Visualisation.draw_boxes(undst,
                                               cars_boxs,
                                               color=(0, 0, 255),
                                               thick=6)
    filtered_boxs, heat_map = get_heat_map_boxs(cars_boxs, undst, isVideo)
    output = Visualisation.draw_boxes(output,
                                      filtered_boxs,
                                      color=(0, 0, 255),
                                      thick=6)
    # Vehicle Detection Code End

    return undst, classified_boxs, heat_map, output
Example #19
def cutImage(image_bin, nPixel, space, verticalCut: bool = False):
    '''
    Evaluates where the image can be cut.

    Parameters
    ----------
    image_bin : array of binarized image pixels
    nPixel : integer threshold of black pixels below which a cut is possible
    space : integer threshold of consecutive white pixels below which no cut is made
    verticalCut : boolean, rotate the image to evaluate vertical cuts (default = False) (optional)

    Returns
    -------
    info : list of triplets [nWhitePixel, pStart, pEnd]; nWhitePixel is the number of
        consecutive white pixels, pStart the first white pixel, pEnd the last white pixel
    '''
    if verticalCut:
        image_bin = image_bin.T

    #Counting black pixels per row (axis=0: col, axis=1:row)
    counts, _ = pp.projection(image_bin)

    #cut contains all lines that have less than nPixel pixels
    cut = []
    for i in range(counts.shape[0]):
        if (counts[i] < nPixel):
            cut.append(i)
    x = 0
    h = 0
    info = []
    flag = False
    for j in range(len(cut) - 1):
        if cut[j + 1] - cut[j] == 1:
            if flag == False:
                h = cut[j]
                flag = True
            x = x + 1
        else:
            info.append([x, h, cut[j]])
            flag = False
            x = 0
    info.append([x, h, cut[j]])

    delete = []
    for k in range(len(info)):
        if info[k][0] < space:
            delete.append(k)
    for m in range(len(delete) - 1, -1, -1):
        del info[delete[m]]  # delete by index; remove-by-value could drop a duplicate entry
    #print('[[nWhitePixel,pStart, pEnd]]:',info)

    if verticalCut:
        image_bin = image_bin.T
        ut.cv.imwrite('verticalCut.tif', image_bin)
    else:
        ut.cv.imwrite('horizontalCut.tif', image_bin)
    return info
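A hedged usage sketch (file name and thresholds are placeholders; the input is expected to be an already binarized page image):

binary_page = ut.cv.imread('page.tif', ut.cv.IMREAD_GRAYSCALE)   # placeholder binarized page
# rows with fewer than 3 black pixels are cut candidates; runs shorter than 10 pixels are ignored
segments = cutImage(binary_page, nPixel=3, space=10, verticalCut=False)
for nWhite, pStart, pEnd in segments:
    print(nWhite, pStart, pEnd)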
Example #20
def logTermWeighting(data, terms):
    logWeight = []  # per-document log-scaled term frequencies (1 + log10 of the raw count)

    documents = [item.document for item in data]

    for document in documents:
        documentWeight = []
        document = pre.split(pre.stemming(pre.filtering(pre.tokenization(document))))

        for term in terms:
            count = document.count(term)
            if count > 0 :
                documentWeight.append(1 + math.log10(count))
            else :
                documentWeight.append(0)

        logWeight.append(documentWeight)

    return logWeight
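To make the three weighting schemes above concrete, here is a small self-contained illustration of the raw, binary and log weights assigned to a single term count (independent of the pre module):

import math

def weights(count):
    raw = count                                        # rawTermWeighting
    binary = 1 if count > 0 else 0                     # binaryTermWeighting
    log = 1 + math.log10(count) if count > 0 else 0    # logTermWeighting
    return raw, binary, log

for c in (0, 1, 10):
    print(c, weights(c))   # e.g. 10 -> (10, 1, 2.0)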
Example #21
def single_model(df, model_type, target_col, cont_feat, cat_feat, refit):
    """For decision tree refit can be one"""
    train, test = PreProcessing.tt_split(
        df[[target_col] + cont_feat + cat_feat], 30)
    train_X, train_Y, test_X, test_Y, labels = pre_processing(
        target_col, train, test, cont_feat, cat_feat)
    grid = build_model(train_X, train_Y, refit, model_type)
    best_model = eval_model(grid, test_X, test_Y, model_type)
    fixed_val_threshold(best_model, test_X, test_Y)
    return best_model, labels
Example #22
def training_model():
    x_train, x_val, x_test, y_train, y_val, y_test = PP.PreProcess(path_smiles)
    # `model` is assumed to be defined at module level; fit() returns a History
    # object, so keep it separate instead of overwriting the model itself
    history = model.fit(x_train,
                        y_train,
                        epochs=5,
                        batch_size=64,
                        verbose=1,
                        validation_data=(x_val, y_val))
    model.save('back_end/Model/servier.h5')
    return
Example #23
def processTrainingData(class_name_list, train_file_path):
    word_dict = {}  # class index (1-based) -> {word: count}; avoids shadowing the builtin dict
    doc_dict = {}   # class index (1-based) -> number of documents
    count = 0
    for class_name in class_name_list:
        word_count_dict = {}
        count += 1
        folder_path = join(train_file_path, class_name)
        filenames = PreProcessing.getFileNames(folder_path)
        for f in filenames:
            filepath = join(folder_path, f)
            filtered_word_list = PreProcessing.readData(filepath)
            word_count_dict = addToDictionary(filtered_word_list,
                                              word_count_dict)

        word_dict[count] = word_count_dict
        doc_dict[count] = len(filenames)
        print("Class: {0}, Total docs:{1}".format(class_name, doc_dict[count]))

    return (word_dict, doc_dict)
Example #24
def cal_smd2(img):
    if len(img.shape) == 3:
        (img, _, _) = cv.split(img)
    metric = 0.0
    x, y = img.shape
    img = pp.image_to_matrix(img)

    for i in range(x-1):
        for j in range(y-1):
            # SMD2: product of the vertical and horizontal neighbour differences
            metric += abs(int(img[i+1, j]) - int(img[i, j])) * abs(int(img[i, j+1]) - int(img[i, j]))
    return metric/1000
Example #25
def cal_vollath(img):
    if len(img.shape) == 3:
        (img, _, _) = cv.split(img)
    img = pp.image_to_matrix(img)
    x, y = img.shape
    source = 0
    for i in range(x-1):
        for j in range(y):
            source += int(img[i, j]) * int(img[i + 1, j])  # cast avoids uint8 overflow
    # Vollath F4: sum of products of vertical neighbours minus M*N*mean^2
    metric = source - x * y * np.mean(img) ** 2
    return metric
Example #26
def cal_energy_gradient(img):
    if len(img.shape) == 3:
        (img, _, _) = cv.split(img)
    metric = 0.0
    x, y = img.shape
    img = pp.image_to_matrix(img)

    for i in range(x-1):
        for j in range(y-1):
            # energy of gradient: squared horizontal difference plus squared vertical difference
            metric += (int(img[i+1, j]) - int(img[i, j]))**2 + (int(img[i, j+1]) - int(img[i, j]))**2

    return metric/100000
Example #27
def remove(img):
    # trim mostly-black border rows/columns (with a 5-pixel margin) on all four sides
    rows, cols = img.shape[:2]
    BinarizedImage = PreProcessing.Binarization(img)

    upper_height = 0
    for i in range(rows):
        temp_sum = 0
        for j in range(cols):
            temp_sum += BinarizedImage[i, j]
        if temp_sum < int(0.15 * 255 * cols):
            upper_height = i + 5
        else:
            break

    lower_height = 0
    for i in range(rows):
        temp_sum = 0
        for j in range(cols):
            temp_sum += BinarizedImage[rows - 1 - i, j]
        if temp_sum < int(0.15 * 255 * cols):
            lower_height = i + 5
        else:
            break

    left_width = 0
    for j in range(cols):
        temp_sum = 0
        for i in range(rows):
            temp_sum += BinarizedImage[i, j]
        if temp_sum < int(0.03 * 255 * rows):
            left_width = j + 5
        else:
            break

    right_width = 0
    for j in range(cols):
        temp_sum = 0
        for i in range(rows):
            temp_sum += BinarizedImage[i, cols - 1 - j]
        if temp_sum < int(0.03 * 255 * rows):
            right_width = j + 5
        else:
            break

    x = left_width
    y = upper_height
    h = rows - 1 - lower_height
    w = cols - 1 - right_width

    return img[y:h, x:w]
Example #28
def applyPreProc():
    """
    Desc : Apply Preprocessing
    """
    print('\n ********* Preprocessing **********')
    #fileName='/nobackup/anikgaik/search/features/Train_Features/Train_Features_Modified.csv'
    #writeFile='/nobackup/anikgaik/search/features/Train_Features/Train_Features_Replacing_Missing.csv'
    fileName = 'D:\\Fall 2015\\Search\\YELP Challenge\\Yelp-Dataset-Challenge-2015\\Task_2\\Selected_Features.csv'
    writeFile1 = 'D:\\Fall 2015\\Search\\YELP Challenge\\Yelp-Dataset-Challenge-2015\\Task_2\\Train_Features_Modified.csv'
    writeFile2 = 'D:\\Fall 2015\\Search\\YELP Challenge\\Yelp-Dataset-Challenge-2015\\Task_2\\Final_Train_Features.csv'

    oPP = PreProc.preprocessing()
    oPP.removeFeature(fileName, writeFile1)
    oPP.featureDiscretization(writeFile1, writeFile2)
Example #29
def log_model(df, target_col, cont_feat, cat_feat, refit):
    '''
    Wrapper for the full logistic-regression workflow: split, preprocess, grid search, evaluate.
    '''
    #cont_feat.extend(bin_names) #bin_names comes from data prep
    train, test = PreProcessing.tt_split(
        df[[target_col] + cont_feat + cat_feat], 30)
    pare_df(df, cont_feat, cat_feat, target_col)
    train_X, train_Y, test_X, test_Y = pre_processing(target_col, train, test,
                                                      cont_feat, cat_feat)
    grid = build_log_model(train_X, train_Y, refit=refit)
    best_log = eval_log_model(grid, test_X, test_Y)
    fixed_val_threshold(best_log, test_X, test_Y)
    return None
def applyPreProc():
    """
    Desc : Apply Preprocessing
    """
    print('\n ********* Preprocessing **********')
    #fileName='/nobackup/anikgaik/search/features/Train_Features/Train_Features_Modified.csv'
    #writeFile='/nobackup/anikgaik/search/features/Train_Features/Train_Features_Replacing_Missing.csv'
    fileName = 'D:\\Fall 2015\\Search\\YELP Challenge\\Yelp-Dataset-Challenge-2015\\Task_2\\Selected_Features.csv'
    writeFile1 = 'D:\\Fall 2015\\Search\\YELP Challenge\\Yelp-Dataset-Challenge-2015\\Task_2\\Train_Features_Modified.csv'
    writeFile2 = 'D:\\Fall 2015\\Search\\YELP Challenge\\Yelp-Dataset-Challenge-2015\\Task_2\\Final_Train_Features.csv'

    oPP = PreProc.preprocessing()
    oPP.removeFeature(fileName, writeFile1)
    oPP.featureDiscretization(writeFile1, writeFile2)
Example #31
def main():
    global tList, xList, yList, zList
    tList, xList, yList, zList = pre.readFile('siri_digits/siri_one_up_bass1.tsv')

    tList, freqList, xList, yList, zList = \
        pre.standardize(tList, xList, yList, zList, highpass=85/ 1000)

    xList, yList, zList = pre.reverseFFT(xList, yList, zList)
    # pre.showMap(tList, xList, yList, zList, '85hz filter')
    # zSmooth, tSmooth = seg.smooth(zList, tList)

    cuttingpoints = [1196, 2021, 3909, 4711, 6614, 7425, 9275, 10085, 11947, 12760, 14604, 15440]

    cnt = 0
    length = len(cuttingpoints)
    for i in range(0, length, 2):
        x_one = xList[cuttingpoints[i]:cuttingpoints[i+1]]
        y_one = yList[cuttingpoints[i]:cuttingpoints[i+1]]
        z_one = zList[cuttingpoints[i]:cuttingpoints[i+1]]
        specX, specY, specZ = generateMap(x_one, y_one, z_one)
        generateRGB(specX, specY, specZ, 'siri_digits/siribassRGB' + str(cnt) + 'two.png')
        cnt += 1
Example #32
def tokenizeToken(word):
	result = []

	#do some preprocessing (tries to filter out variable names and stuff like that)
	preProcessing = PreProcessing.preProcess(word)

	#for every term that survived preprocessing, first try to split on possible separators,
	#then split on UCLC boundaries (camel case) and, if that doesn't produce known words,
	#split them up further with the greedy algorithm
	for t1 in preProcessing:
		firstStep = splitOnSeparators(t1)
		for t2 in firstStep:
			result += splitOnUCLC(t2)
#			for t3 in secondStep:
#				result += refineUnknown(t3)
	return result
Example #33
def argumentSetup():
    # (assumed) build the argument parser returned below; argparse is imported at module level
    parser = argparse.ArgumentParser()
    parser.add_argument('inputFile', help="The input text file, usually a normal text file encoded in utf-8")
    parser.add_argument('outputFile', help="The parsed output CoNLL file encoded in utf-8.")
    parser.add_argument('--model', help="Indicate the model to be used if it is not the default model")
    parser.add_argument('--tagged', action="store_true", help="Indicate to the parser that your data is already in CoNLL format in parsing mode.")
    parser.add_argument('turboOpt', nargs="*", help="Additional options to pass to TurboParser (Without the preceding hyphens: '--evaluate' becomes 'evaluate')")
    
    return parser

if __name__ == "__main__":
    parser = argumentSetup().parse_args()
    taggedFile = parser.inputFile
    # parsing sequence
    if not parser.tagged:
        taggedFile = os.path.join(TEMP, "GSW_tagged"+os.path.basename(parser.inputFile))
        PreProcessing.main(parser.inputFile, taggedFile)
        
    # assign features
    FeatureConfig().run(taggedFile)
        
    model = DEFAULT_MODEL
    if parser.model:
        model = parser.model
        
    # call Turbo Parser
    args = ["--test", "--file_model={}".format(model), "--file_test={}".format(taggedFile), "--file_prediction={}".format(parser.outputFile)] + ["--" + x for x in parser.turboOpt]
    
    print "Called TurboParser with options: "+" ".join(args)
        
    subprocess.call([TURBOP]+ args)
    
Example #34
			DOWN_TRAIN_TOKENIZED
		Validation:
			UP_VALIDATE_RAW
			DOWN_VALIDATE_RAW
			UP_VALIDATE_SENTENCES
			DOWN_VALIDATE_SENTENCES
			UP_VALIDATE_TOKENIZED
			DOWN_VALIDATE_TOKENIZED
		Test:
			Email number is array index + 1
			TEST_RAW
			TEST_SENTENCES
			TEST_TOKENIZED
	 '''

	(UP_TRAIN_RAW, DOWN_TRAIN_RAW, UP_TRAIN_SENTENCES, DOWN_TRAIN_SENTENCES,
	 UP_TRAIN_TOKENIZED, DOWN_TRAIN_TOKENIZED, UP_VALIDATE_RAW, DOWN_VALIDATE_RAW,
	 UP_VALIDATE_SENTENCES, DOWN_VALIDATE_SENTENCES, UP_VALIDATE_TOKENIZED,
	 DOWN_VALIDATE_TOKENIZED, TEST_RAW, TEST_SENTENCES, TEST_TOKENIZED) = PreProcessing.process()

	UP_TRAIN_TOTAL_TOKENIZED = UP_TRAIN_TOKENIZED + UP_VALIDATE_TOKENIZED
	DOWN_TRAIN_TOTAL_TOKENIZED = DOWN_TRAIN_TOKENIZED + DOWN_VALIDATE_TOKENIZED

	VALIDATE_LABELS_UP=[1] * len(UP_VALIDATE_RAW)
	VALIDATE_LABELS_DOWN=[0] * len(DOWN_VALIDATE_RAW)


	print('\n\n========== Unsmoothed N-Grams==========\n\n')

	print('-----===== UP_TRAIN =====-----\n\n')
	upTrainUnigram,upTrainBigram,upTrainTrigram = NGram.getNGram(UP_TRAIN_TOKENIZED)
	print('\n\n-----===== DOWN_TRAIN =====-----\n\n')
	downTrainUnigram,downTrainBigram,downTrainTrigram = NGram.getNGram(DOWN_TRAIN_TOKENIZED)
Example #35
	def process(self, image):
		image = pr.gamma_correction(0.2, image)  # gamma correction with parameter 0.2
		image = pr.dog_filter(image)             # difference-of-Gaussians filtering
		image = pr.histogram_equalize(image)     # histogram equalization
		return image