Example #1
 def pop_up_button1(self, i):
     if i.text() == "OK":
         self.msg1.close()
         MainWindow.close()
         self.time_to_wait_before = self.time_to_wait_combobox.currentText()
         pre_process(self.file_path, self.time_to_wait_before,
                     self.audio_path)
Example #2
def read_test_train():
    train_path = "fbpac-ads-en-US-train.csv"
    test_path = "fbpac-ads-en-US-test.csv"
    # data_path = "data/limited_sample.csv"
    data_train = pd.read_csv(train_path, error_bad_lines=False)
    data_test = pd.read_csv(test_path, error_bad_lines=False)

    # pre-processing all the documents [message:05]
    processed_docs = []

    for index, row in data_train.iterrows():
        try:
            processed_record = pre_process(row[5])
            processed_docs.append(processed_record)
        except:
            print("Error in pre-processing: " + str(index))
    for index, row in data_test.iterrows():
        try:
            processed_record = pre_process(row[5])
            processed_docs.append(processed_record)
        except:
            print("Error in pre-processing: " + str(index))

    print("Log: pre processing is done.")
    return processed_docs
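pre_process itself is not shown on this page, and its signature varies between projects. For the gensim examples that pass its result straight into dictionary.doc2bow (Examples #9, #10, #13 and #14) it has to return a list of tokens; a minimal sketch of such a function, assuming gensim and NLTK are available, could look like this:

import gensim
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import SnowballStemmer

stemmer = SnowballStemmer("english")


def pre_process(text):
    # Hypothetical sketch only: lowercase and tokenize, drop stop words and
    # very short tokens, then stem. The real pre_process may differ.
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in STOPWORDS and len(token) > 3:
            result.append(stemmer.stem(token))
    return result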
Example #3
def windowed_subdivs(model, input_ch, train_mean, train_std, padded_img,
                     window_size, overlap_pct):
    """
    Create tiled overlapping patches.

    Returns:
        5D numpy array of shape = (
            nb_patches_along_X,
            nb_patches_along_Y,
            patches_resolution_along_X,
            patches_resolution_along_Y,
            nb_output_channels
        )

    Note:
        patches_resolution_along_X == patches_resolution_along_Y == window_size
    """

    step = int(window_size * (1 - overlap_pct))
    padx_len = padded_img.shape[0]
    pady_len = padded_img.shape[1]
    subdivs = []

    if input_ch == 3:
        for i in range(0, padx_len - window_size + 1, step):
            subdivs.append([])
            for j in range(0, pady_len - window_size + 1, step):
                patch = padded_img[i:i + window_size, j:j + window_size]
                patch = pre_process(patch, train_mean, train_std) / 255
                patch = cv2.merge((patch, patch, patch))
                subdivs[-1].append(patch)
    else:
        for i in range(0, padx_len - window_size + 1, step):
            subdivs.append([])
            for j in range(0, pady_len - window_size + 1, step):
                patch = padded_img[i:i + window_size, j:j + window_size]
                patch = pre_process(patch, train_mean, train_std) / 255
                patch = np.expand_dims(patch, axis=-1)
                subdivs[-1].append(patch)

    # gc.collect() frees memory between the heavy array operations below;
    # removing these calls is faster when enough RAM is available.
    gc.collect()
    subdivs = np.array(subdivs)
    gc.collect()
    a, b, c, d, e = subdivs.shape
    subdivs = subdivs.reshape(a * b, c, d, e)
    gc.collect()

    subdivs = model.predict(subdivs)
    gc.collect()

    # Reshape back into the 5D array described in the docstring:
    subdivs = subdivs.reshape(a, b, c, d, 1)
    gc.collect()

    return subdivs
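A usage sketch for windowed_subdivs, assuming it is imported together with the image variant of pre_process it calls (and that pre_process returns an array of the same shape as its input). The IdentityModel and the 640x640 padded input below are stand-ins for illustration only:

import numpy as np


class IdentityModel:
    # Stand-in for the trained network: returns its input batch unchanged.
    def predict(self, batch):
        return batch


window_size, overlap_pct = 256, 0.25   # step = int(256 * (1 - 0.25)) = 192
padded_img = np.random.randint(0, 256, (640, 640)).astype(np.float32)
train_mean, train_std = padded_img.mean(), padded_img.std()

preds = windowed_subdivs(IdentityModel(), input_ch=1,
                         train_mean=train_mean, train_std=train_std,
                         padded_img=padded_img, window_size=window_size,
                         overlap_pct=overlap_pct)
print(preds.shape)   # (3, 3, 256, 256, 1) for this 640x640 padded image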
Example #4
def read_seeds_data():
    all_docs = []
    docs_labels = []
    with open('data/seeds.json') as f:
        data = json.load(f)
        try:
            for item in data["not_political"]:
                all_docs.append(pre_process(item))
                docs_labels.append(0)
            for item in data["political"]:
                all_docs.append(pre_process(item))
                docs_labels.append(1)
        except:
            print("Error in reading data.")
    return all_docs, docs_labels
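read_seeds_data expects data/seeds.json to hold two lists of raw ad texts under the keys "not_political" and "political". A minimal file that would satisfy it (the texts are made up for illustration) can be produced like this:

import json

seeds = {
    "not_political": ["50% off all running shoes this weekend only."],
    "political": ["Join us on Saturday to canvass for the city council race."],
}
with open('data/seeds.json', 'w') as f:
    json.dump(seeds, f, indent=2)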
Example #5
def main():

    # pre_process handles the train/test split and the standard-scaler transform

    x_train, x_test, y_train, y_test = pre_process()

    ### feature selection with low variance

    #print(x_train.shape)

    x_train, x_test = delete_low_variance(x_train, x_test)

    #print(x_train.shape)
    #print(x_test.shape)  #this deleted 218 features
    criteria = 'entropy'
    model = DecisionTreeClassifier(criterion=criteria)
    model.fit(x_train, y_train)
    prediction = model.predict(x_test)

    #visualize_tree(model,features)

    #print('the criteria is: ' + criteria)
    #print('the mean accuracy is: %.10f' % model.score(x_test,y_test))
    #print("F-1 score(micro) for test is: %.10f " % f1_score(y_test,prediction,average= 'micro'))
    print("F-1 score(weighted) for test is: %.10f " %
          f1_score(y_test, prediction, average='weighted'))
Example #6
def main():

    # pre_process handles the train/test split and the standard-scaler transform

    x_train, x_test, y_train, y_test = pre_process()


    ### feature selection with low variance

    #print(x_train.shape)

    x_train, x_test = delete_low_variance(x_train, x_test)

    #print(x_train.shape)
    #print(x_test.shape)  #this deleted 218 features
    strength = 1
    model = LogisticRegression(C=strength, penalty='l1', solver="liblinear", multi_class="ovr")
    model.fit(x_train, y_train.values.ravel())
    prediction = model.predict(x_test)
    #print(model.get_params)
    #print("Accuracy score for test is: %.6f" % model.score(x_test, y_test))
    #print("Strength is: %.2f" % strength)
    #print("F-1 score(micro) for test is: %.10f " % f1_score(y_test,prediction,average= 'micro'))
    #print("F-1 score(macro) for test is: %.10f " % f1_score(y_test,prediction,average= 'macro'))
    print "F-1 score(weighted) for test is: %.10f " % f1_score(y_test,prediction,average= 'weighted')
Example #7
def using_bag_of_words(X):
    # preprocess, words needs to be tokenized
    proc_data = pre_process(X, tokenize=True, stop_words=stop_words)

    # get BOW vector
    vec_X = bag_of_words(proc_data)

    return vec_X
Example #8
def run_model():
    x_train, x_test, y_train, y_test = pre_process()
    #x_train, x_test = delete_low_variance(x_train, x_test)

    clf = RandomForestClassifier(criterion="entropy")
    clf.fit(x_train, y_train.values.ravel())
    y_pred = clf.predict(x_test)
    print "Weighted F-1 Score:", f1_score(y_test, y_pred, average="weighted")
Example #9
def docs_to_topics_vector(docs, lda_model, dictionary):
    """ given a list of documents and a trained topic mode, this method return the topic vector
        representation of all documents"""
    docs_topics_vectors = []
    for doc in docs:
        bow_vector = dictionary.doc2bow(pre_process(doc))
        docs_topics_vectors.append(lda_model[bow_vector])
    return docs_topics_vectors
Example #10
def doc_topic_model(doc, lda_model, dictionary):
    """ given a sample document, trained LDA model and its corresponding dictionary, this method prints the topics of the
    documents and a score associated with each topic"""
    print("\n")
    bow_vector = dictionary.doc2bow(pre_process(doc))
    for index, score in sorted(lda_model[bow_vector],
                               key=lambda tup: -1 * tup[1]):
        print("Score: {}\t Topic: {}".format(score,
                                             lda_model.print_topic(index, 5)))
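A usage sketch for doc_topic_model, assuming the LDA model and dictionary were pickled by the training code and that load_file is the project's unpickling helper (the file names are taken from Example #13; the ad text is made up):

lda_model = load_file("models/LDAbow_fbpac.pickle")
lda_dictionary = load_file("models/LDAdict_fbpac.pickle")

sample_ad = "Join us this Saturday to support our candidate for city council."
doc_topic_model(sample_ad, lda_model, lda_dictionary)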
Example #11
    def getDocBlocks(self, idx, blockSize=100, stride=20):
        doc = self.getContext(idx)[0]
        doc = pre_process(doc, stemming=False).split()
        i = 0
        docBlocks = []
        while i < len(doc):
            j = i + blockSize
            wordlist = doc[i:j]
            docBlocks.append(' '.join(wordlist))
            i += stride

        return docBlocks
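The blockSize/stride windowing above is easiest to see with toy numbers: blocks start every stride words and consecutive blocks overlap by blockSize - stride words. A standalone sketch of the same arithmetic:

doc = "w0 w1 w2 w3 w4 w5 w6 w7".split()
blockSize, stride = 5, 2
blocks = [' '.join(doc[i:i + blockSize]) for i in range(0, len(doc), stride)]
print(blocks)
# ['w0 w1 w2 w3 w4', 'w2 w3 w4 w5 w6', 'w4 w5 w6 w7', 'w6 w7']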
Example #12
def run_model():
    print("kNN MODEL RESULTS")
    p_val = [1, 2, 3]
    n_val = [3, 5]
    for n in n_val:
        for dist in p_val:
            x_train, x_test, y_train, y_test = pre_process()
            x_train, x_test = delete_low_variance(x_train, x_test)
            clf = KNeighborsClassifier(p=dist, n_neighbors=n)
            clf.fit(x_train, y_train.values.ravel())
            y_pred = clf.predict(x_test)
            print("kNN: p =", dist, "neighbors =", n)
            print("Weighted F-1 Score:", f1_score(y_test, y_pred, average="weighted"))
Example #13
 def transform(self, X, **transform_params):
     docs_topics_vectors = []
     lda_model = load_file("models/LDAbow_fbpac.pickle")
     lda_dictionary = load_file("models/LDAdict_fbpac.pickle")
     for doc in X:
         try:
             bow_vector = lda_dictionary.doc2bow(pre_process(doc))
             docs_topics_vectors.append(lda_model[bow_vector])
         except Exception as e:
             print(e)
             print("Error in computing topic vector")
     n, nx, ny = np.array(docs_topics_vectors).shape
     d2_all_docs = np.array(docs_topics_vectors).reshape((n, nx * ny))
     return d2_all_docs[:, 1::2]
Example #14
def create_topic_models():
    data_path = "data/fbpac-ads-en-US.csv"
    # data_path = "data/limited_sample.csv"
    data = pd.read_csv(data_path, error_bad_lines=False)

    # pre processing all the documents [title:04 + message:05]
    processed_docs = []
    for index, row in data.iterrows():
        try:
            processed_record = pre_process(row[4] + " " + row[5])
            processed_docs.append(processed_record)
        except:
            print("Error in pre-processing: " + str(index))
    print("Log: pre processing is done.")

    # creating a dictionary of all tokens in all documents
    dictionary = gensim.corpora.Dictionary(processed_docs)
    save_file('models/LDAdict.pickle', dictionary)
    dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=100000)
    print("Log: dictionary is created and saved.")

    # creating bag of words and tf-idf corpora
    bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
    tf_idf = models.TfidfModel(bow_corpus)
    corpus_tf_idf = tf_idf[bow_corpus]

    # creating LDA model using bag of words
    lda_model = gensim.models.LdaMulticore(bow_corpus,
                                           num_topics=2,
                                           id2word=dictionary,
                                           passes=2,
                                           workers=4)
    save_file('models/LDAbow.pickle', lda_model)
    print("Log: lda model [bog] is created and saved.")
    for idx, topic in lda_model.print_topics(-1):
        print('Topic: {} | Words: {}'.format(idx, topic))

    # creating LDA model using tf-idf
    lda_model_tf_idf = gensim.models.LdaMulticore(corpus_tf_idf,
                                                  num_topics=2,
                                                  id2word=dictionary,
                                                  passes=2,
                                                  workers=4)
    save_file('models/LDAtfidf.pickle', lda_model_tf_idf)
    print("Log: lda model [tf-idf] is created and saved.")
    for idx, topic in lda_model_tf_idf.print_topics(-1):
        print('Topic: {} | Words: {}'.format(idx, topic))
Example #15
def using_doc2vec(X):
    # preprocess
    proc_data = pre_process(X, tokenize=True, stop_words=[])

    # no tokenize for doc2vec when creating model
    model, vec_X = doc2vec_create_model(X,
                                        max_epochs=100,
                                        vec_size=10,
                                        alpha=0.025)

    # example of loading a pre-trained model instead (this replaces the model created above)
    model = Doc2Vec.load("Doc2Vec//apnews_dbow//doc2vec.bin")

    # tokenize when using model to convert other data
    vec_X = doc2vec_use_model(proc_data, model)

    return vec_X
Example #16
def predict():
    '''
    For rendering results on HTML GUI
    '''
    questions = []
    questions.append([str(x) for x in request.form.values()])
    df = pd.DataFrame(questions, columns=['question1','question2'])
    df['id'] = 1
    df = pp.pre_process(df)
    df = fg.generate_features(df)
    X = df[['words_diff_q1_q2', 'word_common', 'word_total', 'word_share', 'cosine_distance', 'cityblock_distance',
            'jaccard_distance', 'canberra_distance', 'euclidean_distance', 'minkowski_distance', 'braycurtis_distance',
            'fuzz_qratio', 'fuzz_WRatio', 'fuzz_partial_ratio', 'fuzz_partial_token_set_ratio',
            'fuzz_partial_token_sort_ratio', 'fuzz_token_set_ratio', 'fuzz_token_sort_ratio', 'common_bigrams',
            'common_trigrams', 'q1_readability_score', 'q2_readability_score']]

    prediction = 'duplicate' if (model_rf.predict(X)[0] == 1) else 'not duplicate'
    return render_template('index.html', prediction_text=f'The questions are {prediction}')
Example #17
def read_main_data():
    data = pd.read_csv(data_path, error_bad_lines=False)

    # pre-processing all the documents [message:05]
    processed_docs = []

    # printing unique list of advertisers
    advertisers = data.iloc[:, 16].unique()

    np.savetxt('data/advertisers.txt', advertisers, fmt='%s')

    for index, row in data.iterrows():
        try:
            processed_record = pre_process(row[5])
            processed_docs.append(processed_record)
        except:
            print("Error in pre-processing: " + str(index))
    print("Log: pre processing is done.")
    return processed_docs
Example #18
def main():

    test_data_path = 'train.data.csv'
    test_scheme_path = 'wine.names.csv'

    # test_data_path = 'datasets/iris.data'
    # test_scheme_path = 'datasets/iris.names'

    data, attributes, value_type = read(test_data_path, test_scheme_path)
    random.shuffle(data)
    train_dataset = pre_process(data, attributes, value_type)

    cars = rule_generator(train_dataset, 0.22, 0.6)
    cars.prune_rules(train_dataset)
    cars.rules = cars.pruned_rules

    classifier_m1 = classifier_builder_m1(cars, train_dataset)

    # error_rate = get_error_rate(classifier_m1, train_dataset)

    total_car_number = len(cars.rules)
    # total_classifier_rule_num = len(classifier_m1.rule_list)

    # print("_______________________________________________________")
    # print(error_rate)

    # print("_______________________________________________________")
    # print(total_classifier_rule_num)

    print("_______________________________________________________")
    cars.print_rule()
    print("_______________________________________________________")
    cars.prune_rules(train_dataset)
    cars.print_pruned_rule()
    print("_______________________________________________________")
    print()
    classifier_m1.print()

    print("_______________________________________________________")
    print(total_car_number)
Example #19
def minimum_cosine(query, data):
    query = pre_process(query)
    data = data.copy()
    data.append(query)

    #Transform all the answers and questions into tfidf vectors
    TfidfVec = TfidfVectorizer()
    tfidf = TfidfVec.fit_transform(data)

    #Store all the cosine angles in theta
    theta = []
    #Compute the cosine similarity between the query and the data.
    for i in range(len(data) - 1):
        # cosine_similarity returns a 1x1 matrix; take the scalar and clamp it
        # so floating-point round-off cannot push it above 1.0
        val = min(cosine_similarity(tfidf[-1], tfidf[i])[0, 0], 1.0)

        theta.append(math.acos(val))

    #Find the minimum angle and the index of that text from data and return it
    min_angle = min(theta)
    data_index = theta.index(min_angle)

    return min_angle, data_index
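A usage sketch for minimum_cosine with a toy FAQ corpus; pre_process is assumed here to return a cleaned string, since its result is appended to the list handed to TfidfVectorizer:

faq = [
    "how do i reset my password",
    "what are your opening hours",
    "how can i contact customer support",
]
angle, idx = minimum_cosine("I forgot my password, how do I change it?", faq)
print(faq[idx], angle)   # the smallest angle marks the most similar entry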
Example #20
def cross_validate_m1_without_prune(data_path,
                                    scheme_path,
                                    minsup=0.01,
                                    minconf=0.5):
    data, attributes, value_type = read(data_path, scheme_path)
    random.shuffle(data)
    dataset = pre_process(data, attributes, value_type)

    block_size = int(len(dataset) / 10)
    split_point = [k * block_size for k in range(0, 10)]
    split_point.append(len(dataset))

    cba_rg_total_runtime = 0
    cba_cb_total_runtime = 0
    total_car_number = 0
    total_classifier_rule_num = 0
    error_total_rate = 0

    for k in range(len(split_point) - 1):
        print("\nRound %d:" % k)

        training_dataset = dataset[:split_point[k]] + dataset[split_point[k +
                                                                          1]:]
        test_dataset = dataset[split_point[k]:split_point[k + 1]]

        start_time = time.time()
        cars = rule_generator(training_dataset, minsup, minconf)
        end_time = time.time()
        cba_rg_runtime = end_time - start_time
        cba_rg_total_runtime += cba_rg_runtime

        start_time = time.time()
        classifier_m1 = classifier_builder_m1(cars, training_dataset)
        end_time = time.time()
        cba_cb_runtime = end_time - start_time
        cba_cb_total_runtime += cba_cb_runtime

        error_rate = get_error_rate(classifier_m1, test_dataset)
        error_total_rate += error_rate

        total_car_number += len(cars.rules)
        total_classifier_rule_num += len(classifier_m1.rule_list)

        print("CBA's error rate without pruning: %.1lf%%" % (error_rate * 100))
        print("No. of CARs without pruning: %d" % len(cars.rules))
        print("CBA-RG's run time without pruning: %.2lf s" % cba_rg_runtime)
        print("CBA-CB M1's run time without pruning: %.2lf s" % cba_cb_runtime)
        print("No. of rules in classifier of CBA-CB M1 without pruning: %d" %
              len(classifier_m1.rule_list))

    print("\nAverage CBA's error rate without pruning: %.1lf%%" %
          (error_total_rate / 10 * 100))
    print("Average No. of CARs without pruning: %d" %
          int(total_car_number / 10))
    print("Average CBA-RG's run time without pruning: %.2lf s" %
          (cba_rg_total_runtime / 10))
    print("Average CBA-CB M1's run time without pruning: %.2lf s" %
          (cba_cb_total_runtime / 10))
    print(
        "Average No. of rules in classifier of CBA-CB M1 without pruning: %d" %
        int(total_classifier_rule_num / 10))
Example #21
 def pop_up_button2(self, j):
     if j.text() == "OK":
         self.msg2.close()
         MainWindow.close()
         pre_process(self.file_path, self.time_to_wait_before,
                     self.audio_path)
Example #22
def cross_validate(data_path,
                   scheme_path,
                   class_first=False,
                   minsup=0.1,
                   minconf=0.6):
    data, attributes, value_type = read(data_path, scheme_path)
    if class_first:
        for i in range(len(data)):
            a = data[i].pop(0)
            data[i].append(a)
        a = attributes.pop(0)
        attributes.append(a)
        b = value_type.pop(0)
        value_type.append(b)
        # print(data[0])
    random.shuffle(data)
    dataset = pre_process(data, attributes, value_type)

    block_size = int(len(dataset) / 10)
    split_point = [k * block_size for k in range(0, 10)]
    split_point.append(len(dataset))

    cba_rg_total_runtime = 0
    cba_cb_total_runtime = 0
    total_car_number = 0
    total_classifier_rule_num = 0
    error_total_rate = 0
    acc_total = 0
    for k in range(len(split_point) - 1):
        print("\nRound %d:" % k)

        training_dataset = dataset[:split_point[k]] + dataset[split_point[k +
                                                                          1]:]
        test_dataset = dataset[split_point[k]:split_point[k + 1]]

        start_time = time.time()
        cars = rule_generator(training_dataset, minsup, minconf)
        end_time = time.time()
        cba_rg_runtime = end_time - start_time
        cba_rg_total_runtime += cba_rg_runtime

        start_time = time.time()
        classifier = classifier_builder_m1(cars, training_dataset)
        end_time = time.time()
        cba_cb_runtime = end_time - start_time
        cba_cb_total_runtime += cba_cb_runtime

        classifier.print()
        res = acc(classifier, test_dataset)
        acc_total += res

        error_rate = get_error_rate(classifier, test_dataset)
        error_total_rate += error_rate

        total_car_number += len(cars.rules)
        total_classifier_rule_num += len(classifier.rule_list)

        print("accuracy:", (res * 100))
        print("No. of CARs : ", len(cars.rules))
        print("CBA-RG's run time : s", cba_rg_runtime)
        print("CBA-CB M1's run time :  s", cba_cb_runtime)
        print("No. of rules in classifier of CBA-CB: ",
              len(classifier.rule_list))

    print("\n Average CBA's accuracy :", (acc_total / 10 * 100))
    print("Average No. of CARs : ", (total_car_number / 10))
    print("Average CBA-RG's run time: ", (cba_rg_total_runtime / 10))
    print("Average CBA-CB run time:  ", (cba_cb_total_runtime / 10))
    print("Average No. of rules in classifier of CBA-CB: ",
          (total_classifier_rule_num / 10))
Example #23
# Params
max_word = 25
# Fraction of data used for train, validation and test
train_pct, val_pct, test_pct = 0.9, 0.0, 0.10
batch_size = 32
n_class = 2
n_epoch = 10

# Other
saving_dir = 'saved_model'

# Load Movie Review Data
df = load_movie_review_data()
set_params(max_word=max_word)
word_array, sentiment = pre_process(df)

# Divide data into train, validation and test set
len_data = word_array.shape[0]
n_train_data = int(len_data * train_pct)
train_input = word_array[:n_train_data]
train_target = sentiment[:n_train_data]
val_data_index = int(len_data * (train_pct + val_pct))
n_val_data = val_data_index - n_train_data
val_input = word_array[n_train_data:val_data_index]
val_target = sentiment[n_train_data:val_data_index]
n_test_data = len_data - (n_train_data + n_val_data)
test_input = word_array[val_data_index:]
test_target = sentiment[val_data_index:]
# free memory space
word_array, sentiment = [], []
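With train_pct, val_pct, test_pct = 0.9, 0.0, 0.10 the index arithmetic above splits the data 90/0/10; a quick check with an assumed data set of 1,000 reviews:

len_data = 1000
n_train_data = int(len_data * 0.9)                      # 900 training rows
val_data_index = int(len_data * (0.9 + 0.0))            # 900 -> empty validation slice
n_val_data = val_data_index - n_train_data              # 0
n_test_data = len_data - (n_train_data + n_val_data)    # 100 test rows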
Example #24
 def getQuestion(self, idx):
     return pre_process(self.trainData[idx]['question'], stemming=False)
Example #25
def cross_validate_m1_without_prune(data_path, scheme_path, class_first=False,
                                    minsup=0.1, minconf=0.6):
    data, attributes, value_type = read(data_path, scheme_path)
    if class_first:
        for i in range(len(data)):
            a = data[i].pop(0)
            data[i].append(a)
        a = attributes.pop(0)
        attributes.append(a)
        b = value_type.pop(0)
        value_type.append(b)
        # print(data[0])
    random.shuffle(data)
    dataset = pre_process(data, attributes, value_type)

    block_size = int(len(dataset) / 10)
    split_point = [k * block_size for k in range(0, 10)]
    split_point.append(len(dataset))

    apr_rg_total_runtime = 0
    apr_cb_total_runtime = 0
    total_car_number = 0
    total_classifier_rule_num = 0
    error_total_rate = 0
    acc_total = 0
    for k in range(len(split_point) - 1):
        print("\nRound %d:" % k)

        training_dataset = dataset[:split_point[k]] + dataset[split_point[k + 1]:]
        test_dataset = dataset[split_point[k]:split_point[k + 1]]

        start_time = time.time()
        cars = rule_generator(training_dataset, minsup, minconf)
        end_time = time.time()
        apr_rg_runtime = end_time - start_time
        apr_rg_total_runtime += apr_rg_runtime

        # group the CARs by the length of their condition set, sort each group
        # with sort_dict, then flatten the groups into a single list u
        arr = list(cars.rules_list)
        max_len = -1

        for i in range(len(arr)):
            if len(arr[i].cond_set) > max_len:
                max_len = len(arr[i].cond_set)
        T = [[] for i in range(max_len)]
        for i in range(len(arr)):
            T[len(arr[i].cond_set) - 1].append(arr[i])
        u = []
        for i in range(len(T)):
            T[i] = sort_dict(T[i])

            for j in T[i]:
                u.append(j)
        # print([u[i].cond_set for i in range(len(u))])

        start_time = time.time()
        # print("----------")
        classifier = classifier_builder_m1(cars, training_dataset, minsup,
                                           len(training_dataset), u)
        end_time = time.time()
        apr_cb_runtime = (end_time - start_time) / 10
        apr_cb_total_runtime += apr_cb_runtime

        classifier.print()
        res = acc(classifier, test_dataset)
        acc_total += res

        error_rate = get_error_rate(classifier, test_dataset)
        error_total_rate += error_rate

        total_car_number += len(cars.rules)
        total_classifier_rule_num += len(classifier.rule_list)

        print("accuracy:",(res*100))
        print("No. of CARs : ",len(cars.rules_list))
        print("apr-RG's run time : s" ,apr_rg_runtime)
        print("apr-CB run time :  s" ,apr_cb_runtime)
        print("No. of rules in classifier of apr: " ,len(classifier.rule_list))

    print("\n Average APR's accuracy :",(acc_total/10*100))
    print("Average No. of CARs : ",(total_car_number / 10))
    print("Average apr-RG's run time : " ,(apr_rg_total_runtime / 10))
    print("Average apr-CB run time :  " ,(apr_cb_total_runtime / 10))
    print("Average No. of rules in classifier of apr: " ,(total_classifier_rule_num / 10))
Example #26
# Main entry point in this file: returns the processed data list and the scheme lists.
# data_path: path to the *.data file.
# scheme_path: path to the *.names file.
def read(data_path, scheme_path):
    data = read_data(data_path)
    attributes, value_type = read_scheme(scheme_path)
    data = str2numerical(data, value_type)
    return data, attributes, value_type


# quick manual test
if __name__ == '__main__':
    import pre_processing

    test_data_path = 'zoo.data'
    test_scheme_path = 'zoo.names'
    test_data, test_attributes, test_value_type = read(test_data_path,
                                                       test_scheme_path)

    # for i in range(len(test_data)):
    #     a=test_data[i].pop(0)
    #     test_data[i].append(a)
    # a=test_attributes.pop(0)
    # test_attributes.append(a)
    # b=test_value_type.pop(0)
    # test_value_type.append(b)
    # print(test_data[0])
    result_data = pre_processing.pre_process(test_data, test_attributes,
                                             test_value_type)
    print(result_data)
Example #27
def get_details():
    path = args.input
    all_values = []
    for i in os.listdir(path):
        single_img = []
        if i.endswith(".jpg"):
            #filename = i
            image_path = f'{os.path.join(path,i)}'
            image = pre_process(image_path)
            with io.open(image_path, 'rb') as image_file:
                content = image_file.read()
            image = vision.types.Image(
                content=content)  # construct an image instance
            # annotates Image Response
            response = client.text_detection(
                image=image)  # returns TextAnnotation
            texts = response.text_annotations
            all_ = {}
            nums = []
            dates = []
            pattern = get_pattern()
            for text in texts:
                if (re.match("^[A-Z]{2}[0-9]{1,2}[A-Z0-9]{1,3}[0-9]{3,4}$",
                             re.sub('\W+', '', text.description))):
                    all_['Reg No'] = text.description
                if (re.match("^[^\\Wioq]{11,18}$", text.description)
                        and re.search("[0-9]{5,6}$", text.description)):
                    nums.append(text.description)
                if (get_fields(pattern.fullmatch(text.description))):
                    dates.append(
                        get_fields(pattern.fullmatch(text.description)))
                if (re.match(
                        r'^[0-9]{1,2}[-|\/]{1}[0-9]{1,2}[-|\/]{1}[0-9]{4}$',
                        text.description)):
                    dates.append(text.description)
            if len(nums) == 1:
                all_['VIN No/Chassis No'] = nums[0]
            elif len(nums) > 1 and len(nums) < 3:
                all_['VIN No/Chassis No'] = sorted(nums, key=len)[-1]
                all_['Engine No'] = sorted(nums, key=len)[-2]
            if len(dates) >= 1:
                for i in dates:
                    if (len(i) == 2):
                        i['day'] = '01'
                D = []
                for j in dates:
                    try:
                        if (len(j) == 3):
                            D.append(j['year'] + '-' + j['month'] + '-' +
                                     j['day'])
                            D.sort()
                            #print(j)
                    except KeyError:
                        D = D
                #for k in D:
                #all_.append(k)
                if len(D) > 1:
                    all_['MFG DT'] = D[0]
                    all_['REG DT'] = D[1]
                else:
                    # fall back to the first detected date when only one was usable
                    all_['MFG DT'] = dates[0]
            #print(all_)
            all_values.append(all_)
    with open('detailsss.txt', 'w+') as f:
        f.write(str(all_values))
    return all_values
Example #28
dems = []
gop = []
others = []
for index, row in advertiser_partisanship.iterrows():
    if row[2] == "Dem":
        dems.append(row[0])
    elif row[2] == "GOP":
        gop.append(row[0])
    elif row[2] == "nonpartisan" or row[2] == "other":
        others.append(row[0])

docs_topics_vectors = []
lda_model = load_file("models/LDAbow_fbpac.pickle")
lda_dictionary = load_file("models/LDAdict_fbpac.pickle")
for doc in train_df['text']:
    try:
        bow_vector = lda_dictionary.doc2bow(pre_process(doc))
        docs_topics_vectors.append(lda_model[bow_vector])
    except Exception as e:
        print(e)
        print("Error in computing topic vector")
n, nx, ny = np.array(docs_topics_vectors).shape
d2_all_docs = np.array(docs_topics_vectors).reshape((n, nx * ny))
X = d2_all_docs[:, 1::2]

x_filtered = []
x_advertiser = []
for i in range(n):
    result = np.sort(X[i])
    if not (round(X[i][3], 3) == 0.2 and round(X[i][4], 3) == 0.2):
        if str(advertiser_df[i]) != 'nan':
            x_filtered.append([X[i][3], X[i][4]])
Example #29
    thresh = conf_matrix.max() / 2.
    for i in range(conf_matrix.shape[0]):
        for j in range(conf_matrix.shape[1]):
            ax.text(j, i, format(conf_matrix[i, j], fmt),
                    ha="center", va="center",
                    color="white" if conf_matrix[i, j] > thresh else "black")
    fig.tight_layout()
    return ax

training_accs = []
test_accs = []
for leaf_nodes in range(2, 128):
    file_prefix = f"../output/max_leaf_nodes_{leaf_nodes}/"

    # Get data
    df = pre.pre_process()

    # Features for the training set
    X = df[["is_gender_female", "is_gender_male", "is_race_group A", "is_race_group B", "is_race_group C",
                     "is_race_group D", "is_race_group E", "is_parent_education_associate's degree",
                     "is_parent_education_bachelor's degree", "is_parent_education_high school",
                     "is_parent_education_master's degree", "is_parent_education_some college",
                     "is_parent_education_some high school", "is_lunch_free/reduced", "is_lunch_standard",
                     "is_prepared_completed", "is_prepared_none"]]
    features = X.keys()
    X = np.array(X)
    # Targets for the training set
    y = df["student performance"]
    y = np.array(y)

    # Hold one out cross validation
Example #30
    for i in range(len(x)):
        # print(inx2word_dic[i])
        if i * x[i] != 0:
            x_inx.append(i)
    print(x_inx)
    sent = [inx2word_dic[i] for i in x_inx]
    return ' '.join(sent)


# Random forest model; it is inherently a multi-class classifier
def random_forest_cla(training_data, model_name):
    X_train = training_data['X']
    y_train = training_data['y']
    forest_clf = RandomForestClassifier(random_state=42)
    forest_clf.fit(X_train, y_train)
    joblib.dump(forest_clf, model_name)


if __name__ == '__main__':
    print('Start to load training data...')
    data_dir_path = r'data'
    training_file = 'train.txt'
    training_data = pre_process(data_dir_path,
                                training_file,
                                min_freq=10,
                                count_in_sent=False)
    print('Training data\'s shape is', training_data['X'].shape)
    print('Start to fit model...')
    saved_model_name = 'saved_model.pkl'
    random_forest_cla(training_data, model_name=saved_model_name)
    print('Model saved in:', os.path.abspath(saved_model_name))