def get_data(data_file, data_type="TRAIN"):
    file_path = data_file
    train_json = dict()
    xmlp = ET.XMLParser(encoding="utf-8")
    tree = ET.parse(file_path, parser=xmlp)
    root = tree.getroot()
    pbar = tqdm(total=len(root.findall('pair')))
    print('\nLoading premise and hypothesis sentences from disk...')
    print('-' * 55)
    for id, pair in enumerate(root.findall('pair')):
        # pair_ID = pair.find('pair id').text
        text1 = pair.find('t1').text
        text2 = pair.find('t2').text
        preprocessed_premise = pre.process(text1)
        preprocessed_hyp = pre.process(text2)
        temp = dict()
        temp['text1'] = preprocessed_premise.lstrip()
        temp['text2'] = preprocessed_hyp.lstrip()
        if data_type == "TRAIN":
            label = pair.find('Label').text
            if label == 'Y':
                label = 1
            if label == 'N':
                label = 0
            temp['label'] = label
        train_json[id] = temp
        pbar.update(1)
    return train_json
def run_classification():
    if not path.exists(PROCESSED_CORPUS_PATH):
        preprocessing.process(SRC_DIR)
    if not path.exists(FEATURES_FILE):
        x, y, le = extract_books_features_from_corpus()
        save_book_features_to_file(x, y, le)
    else:
        x, y = load_features_from_file()
    hybrid_classification(x, y)
def main():
    for image_count in range(1, 6):
        process_dict = {}
        image = cv2.imread(f"input/{image_count}.jpg", cv2.IMREAD_GRAYSCALE)
        # cv2.imwrite('ocr.png', image)
        # if (image.shape[0] < image.shape[1]):
        #     image =
        process_dict["image"] = image
        process_dict["image_height"] = image.shape[0]
        process_dict["image_width"] = image.shape[1]
        # retval, thresh = cv2.threshold(image, 200, 255, cv2.THRESH_BINARY)
        print(image.shape)
        # cv2.imwrite("thresh_200.png", thresh)
        threshold = cv2.adaptiveThreshold(image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                          cv2.THRESH_BINARY, 201, 15)
        # blur = cv2.GaussianBlur(image, (7, 7), 0)
        # retval, threshold = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
        process_dict["binary"] = threshold
        process_dict["count"] = image_count
        process_dict = process(process_dict)
        print(f"Image preprocessing done for {image_count}.")
def __getitem__(self, case_num, size=(128, 128, 128)):
    if self.is_validation:
        case_num += 200
    try:
        img = np.load(self.path + '/' + "{}_i.npy".format(case_num))
        mask = np.load(self.path + '/' + "{}_m.npy".format(case_num))
        return img, mask
    except (IOError, FileNotFoundError):
        img = self.df.loc[self.df['case_id'] == case_num]['image']
        # print("THIS IS WHAT WE ON")
        img = nib.load(img.values[0])
        affine = img.affine
        img = img.get_fdata()
        img = process(img, size)
        img = np.expand_dims(img, axis=3)
        img = np.expand_dims(img, axis=0)
        mask = self.df.loc[self.df['case_id'] == case_num]['mask']
        mask = nib.load(mask.values[0])
        # get_data() was deprecated and later removed from nibabel; use get_fdata()
        mask = mask.get_fdata()
        # mask = resize_image(mask, size, is_mask=True)
        mask_off = mask > 1.5
        mask[mask_off] = 1  # This will turn all tumor pixels into kidney pixels
        mask = np.expand_dims(mask, axis=3)
        mask = np.expand_dims(mask, axis=0)
        np.save(self.path + '/' + "{}_i".format(case_num), img)
        np.save(self.path + '/' + "{}_m".format(case_num), mask)
        return img, mask
def process(X, size):
    a, b = size
    X_new = []
    for i in range(len(X)):
        X_new.append(preprocessing.process(X[i], [a, b]))
        # preprocessing.make_image(preprocessing.process(X[i], [12, 12]), str(i))
    return np.array(X_new)
def kfold_cv(x, y, model):
    nested_train_scores = list()
    nested_test_scores = list()
    outer_cv = KFold(n_splits=5, shuffle=True, random_state=1)
    for train_index, test_index in tqdm(outer_cv.split(x)):
        # split data
        xTrain = x.iloc[train_index]
        xTest = x.iloc[test_index]
        yTrain = y.iloc[train_index]
        yTest = y.iloc[test_index]
        xTrain, yTrain, xTest, yTest = preprocessing.process(
            xTrain, yTrain, xTest, yTest)
        # PCA (number of components chosen such that the amount of variance
        # explained is greater than the percentage specified by n_components)
        sklearn_PCA = PCA(n_components=0.95, svd_solver='full')
        xTrain = sklearn_PCA.fit_transform(xTrain)
        xTest = sklearn_PCA.transform(xTest)
        # xTrain, yTrain, xTest, yTest = df_to_numpy(xTrain, yTrain, xTest, yTest)
        md = model.fit(xTrain, yTrain)
        # Train score
        yHat1 = md.predict(xTrain)
        r2 = r2_score(yTrain, yHat1)
        nested_train_scores.append(r2)
        # Test score
        yHat2 = md.predict(xTest)
        r2 = r2_score(yTest, yHat2)
        nested_test_scores.append(r2)
    return nested_train_scores, nested_test_scores
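# Hypothetical usage sketch for kfold_cv (not part of the original source): it assumes
# x is a pandas DataFrame of features, y a pandas Series of targets, and that
# preprocessing.process accepts and returns the four splits as used above. Any
# scikit-learn regressor with fit/predict should work as `model`.
import numpy as np
from sklearn.linear_model import Ridge

train_r2, test_r2 = kfold_cv(x, y, Ridge(alpha=1.0))
print("mean train R^2:", np.mean(train_r2))
print("mean test R^2:", np.mean(test_r2))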
def predict(self, datafile):
    """Predicts class labels for the input instances in file 'datafile'

    Returns the list of predicted labels
    """
    lines = self.retrieveData(datafile)
    tokens, data = proc.process(lines)
    # return the predictions, as promised by the docstring
    return self.clf.predict(data.iloc[:, 1:data.shape[1]])
def train(self, trainfile):
    """Trains the classifier model on the training set stored in file trainfile"""
    lines = self.retrieveData(trainfile)
    data, y_train = proc.process(lines)
    vocab = []
    for sent in data['words_in_window']:
        for w in sent:
            if w not in vocab:
                vocab.append(w)
    vocab_size = len(vocab)
    self.tokenizer = Tokenizer(num_words=vocab_size)
    self.tokenizer.fit_on_texts(data.words_in_window)
    sentiment_tokenized = pd.DataFrame(
        self.tokenizer.texts_to_matrix(data.words_in_window))
    self.clf_tok = Sequential()
    self.clf_tok.add(Dense(128, input_shape=(vocab_size,), activation='softmax'))
    # input_shape on a non-first layer is ignored by Keras, so it is dropped here
    self.clf_tok.add(Dense(64, activation='relu'))
    self.clf_tok.add(Dense(3, activation='softmax'))
    self.clf_tok.compile(loss='categorical_crossentropy',
                         optimizer='adam',
                         metrics=['accuracy'])
    self.clf_tok.fit(sentiment_tokenized, y_train, epochs=10, batch_size=32)
def process_file(filepath):
    # tesseract read text
    tiff_image_path = next(preprocessing.process(filepath))
    tesseract_results = ocr.process(tiff_image_path)
    tess_text = compile_text(tesseract_results)
    yield tess_text
    # ground truth
    yield from clean_ground_truth.process(filepath)
def train(self, trainfile):
    """Trains the classifier model on the training set stored in file trainfile"""
    lines = self.retrieveData(trainfile)
    tokens, data = proc.process(lines)
    parameters = {'kernel': ('linear', 'rbf'), 'C': [1, 10]}
    svc = svm.SVC(gamma="scale")
    self.clf = GridSearchCV(svc, parameters, cv=5)
    self.clf.fit(data.iloc[:, 1:data.shape[1]], data.iloc[:, 0])
def process(X, size):
    print('processing....')
    t = time.time()
    a, b = size
    X_new = []
    for i in range(len(X)):
        X_new.append(preprocessing.process(X[i], [a, b]))
        # preprocessing.make_image(preprocessing.process(X[i], [12, 12]), str(i))
    print('processing done in', time.time() - t, 'seconds')
    return np.array(X_new)
def get_Xy(path):
    X, y, file = [], [], []
    # allow_pickle=True is required on newer NumPy versions to load a pickled dict
    d = np.load('/'.join(sys.argv[0].split('/')[:-1]) + '/acid_data/pka.npy',
                allow_pickle=True).item()
    for i in os.listdir(path):
        if i[-8:] == '.g16.out':
            X.append(preprocessing.process(coord(i), [100, 100]))
            # preprocessing.make_image(preprocessing.process(coord(i), [100, 100]), i.split('.')[0])
            y.append([float(d[i.split('.')[0]])])
            file.append(i.strip().split()[0])
    return np.array(X), np.array(y), file
def get_data(input, flag):
    data, labels = [], []
    if flag:
        train_data = preprocessing.process(test_file)
        data, labels = train_data['text'], train_data['labels']
    else:
        with open(train_file) as f:
            gen = chunks.read_chunk(f, "\n")
            # will adjust the 40k limit once a better value is chosen
            for i in range(40000):
                s = next(gen).split('\t')
                data.append(s[-1])
                labels.append(s[-2])
    return data, labels
def nested_cv(x, y, model, p_grid):
    # the nested CV below could be condensed with the following code:
    """
    pipeline = Pipeline([('transformer', scalar), ('estimator', clf)])
    cv = KFold(n_splits=4)
    scores = cross_val_score(pipeline, X, y, cv=cv)
    """
    nested_train_scores = list()
    nested_test_scores = list()
    nested_params = list()
    # nested cv
    outer_cv = KFold(n_splits=5, shuffle=True, random_state=1)
    for train_index, test_index in tqdm(outer_cv.split(x)):
        # split data
        xTrain = x.iloc[train_index]
        xTest = x.iloc[test_index]
        yTrain = y.iloc[train_index]
        yTest = y.iloc[test_index]
        inner_cv = KFold(n_splits=5, shuffle=True, random_state=1)
        xTrain, yTrain, xTest, yTest = preprocessing.process(
            xTrain, yTrain, xTest, yTest)
        # PCA (number of components chosen such that the amount of variance
        # explained is greater than the percentage specified by n_components)
        sklearn_PCA = PCA(n_components=0.95, svd_solver='full')
        xTrain = sklearn_PCA.fit_transform(xTrain)
        xTest = sklearn_PCA.transform(xTest)
        # xTrain, yTrain, xTest, yTest = df_to_numpy(xTrain, yTrain, xTest, yTest)
        # Inner grid search, scored with R^2
        clf = GridSearchCV(estimator=model,
                           param_grid=p_grid,
                           scoring='r2',
                           cv=inner_cv,
                           refit=True)
        fitter = clf.fit(xTrain, yTrain)
        best_model = fitter.best_estimator_
        nested_params.append(fitter.best_params_)
        # Train score
        yHat1 = best_model.predict(xTrain)
        r2 = r2_score(yTrain, yHat1)
        nested_train_scores.append(r2)
        # Test score
        yHat2 = best_model.predict(xTest)
        r2 = r2_score(yTest, yHat2)
        nested_test_scores.append(r2)
    return nested_train_scores, nested_test_scores, nested_params
def predict(self, datafile):
    """Predicts class labels for the input instances in file 'datafile'

    Returns the list of predicted labels
    """
    lines = self.retrieveData(datafile)
    data_eval, y_eval = proc.process(lines)
    x_eval = pd.DataFrame(
        self.tokenizer.texts_to_matrix(data_eval.words_in_window))
    dic = {0: 'negative', 1: 'neutral', 2: 'positive'}
    pred = [dic.get(n, n) for n in np.argmax(self.clf_tok.predict(x_eval), 1)]
    return pred
def main():
    args = parse_args()
    text = args.i.readlines()
    processed_lines = []
    for line in text:
        preprocessed_line = preprocessing.process(line.strip())
        if len(preprocessed_line) != 0:
            processed_line = map_replacement.replace_from_maps(preprocessed_line)
            processed_lines.append(processed_line)
    for line in processed_lines:
        args.o.write(line + '\n')
def query(self, model_path, n_samples_query, n_results, custom=False, weights=False):
    vertices, element_dict, info = read_model(model_path)
    shape = Shape(vertices, element_dict, info)
    shape = process(shape, n_vertices_target=self.n_vertices_target)
    feature_dict = extract_features(shape, self.n_bins, n_samples=n_samples_query)
    feature_df = data_dict_parser(feature_dict)
    feature_df, _ = sample_normalizer(
        feature_df, *self.sample_normalization_parameters,
        divide_distributions=self.divide_distributions)
    feature_df_numeric = feature_df.select_dtypes(np.number)
    # Make sure columns are identical and ordered
    assert list(feature_df_numeric.columns) == list(
        self.df_numeric.columns), "Column mismatch!"
    query_vector = feature_df_numeric.iloc[0, :].values.astype(np.float32)
    if not custom:
        distances, indices = self.faiss_knn.query(query_vector, n_results)
    else:
        distances, indices = self.custom_knn.query(query_vector, n_results, weights=weights)
    distances = distances.flatten().tolist()  # Flatten batch dimension
    indices = indices.flatten().tolist()
    df_slice = self.df[self.df.index.isin(indices)]
    df_slice['distance'] = df_slice.index.map(lambda x: distances[indices.index(x)])
    # Add missing data to query df
    feature_df['file_name'] = str(model_path)
    feature_df['classification'] = 'query_input'
    feature_df['distance'] = 0  # Put it at the top of the slice
    df_slice = pd.concat([df_slice, feature_df])
    df_slice = df_slice.sort_values('distance')
    return distances, indices, df_slice
def process_subset(self, file_list, apply_processing, n_vertices_target, n_bins, process_index):
    print(f' {process_index} : Starting subset processor!')
    data_subset = {k: [] for k in self.columns + self.col_array}
    for index, file in enumerate(file_list):
        if index % 50 == 0:
            print(f' {process_index} : Is at {index}/{len(file_list)}!')
        vertices, element_dict, info = read_model(Path(file))
        shape = Shape(vertices, element_dict, info)
        if apply_processing:
            shape = process(shape, n_vertices_target=n_vertices_target)
        else:
            shape.make_pyvista_mesh()
        id = os.path.basename(file).split(".")[0].replace("m", "")
        if id in self.classification_dict.keys():
            classification = self.classification_dict[id]
        else:
            classification = None
        data_subset["classification"].append(classification)
        data_subset["file_name"].append(file)
        # Get features
        feature_dict = extract_features(shape, n_bins=n_bins, n_samples=self.n_samples)
        # Add them to the total data
        for key, val in feature_dict.items():
            data_subset[key].append(val)
    print(f'{process_index} : Finished!')
    return data_subset
import sys
sys.path.append("../")

from gensim import models
import pandas as pd
import numpy as np

import preprocessing as pp

filename = "../../../data/sample.csv"
data = pd.read_csv(filename, sep=',')
data['header_features'] = data.Headline.apply(lambda x: pp.process(x))
data['content_features'] = data.articleBody.apply(lambda x: pp.process(x))

# Word2Vec.load_word2vec_format was removed in gensim 1.0; KeyedVectors is the current loader
model = models.KeyedVectors.load_word2vec_format(
    '/media/sree/venus/code/word2vec/GoogleNews-vectors-negative300.bin', binary=True)


def sent2vec(words):
    M = []
    for w in words:
        try:
            M.append(model[w])
        except KeyError:
            continue
    M = np.array(M)
    v = M.sum(axis=0)
    return v / np.sqrt((v ** 2).sum())
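# Hypothetical usage sketch (not part of the original source): it assumes pp.process
# returns an iterable of tokens per document, so each row can be collapsed into a
# single 300-dimensional vector with sent2vec defined above.
data['header_vector'] = data['header_features'].apply(sent2vec)
data['content_vector'] = data['content_features'].apply(sent2vec)
print(data[['header_vector', 'content_vector']].head())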
def preprocess_callback():
    """
    Starts preprocessing
    :return:
    """
    preprocessing.process()
y_train = Training_dataframe.iloc[:, 1]
x_test = Testing_dataframe.loc[:, attrib[2:len(attrib)]]
y_test = Testing_dataframe.iloc[:, 1]

y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

# merge the 25 news items together to form a single signal
merged_x_train = x_train.apply(lambda x: ''.join(str(x.values)), axis=1)
merged_x_test = x_test.apply(lambda x: ''.join(str(x.values)), axis=1)

# ===============
# pre-process
# ===============
merged_x_train = merged_x_train.apply(lambda x: pp.process(x))
merged_x_test = merged_x_test.apply(lambda x: pp.process(x))

# remove stopwords in the training and testing set
train_without_sw = []
test_without_sw = []
train_temporary = list(merged_x_train)
test_temporary = list(merged_x_test)
s = pp.stop_words
for i in train_temporary:
    f = i.split(' ')
    for j in f:
        if j in s:
            f.remove(j)
    s1 = ""
    for k in f:
from keras.models import Sequential
from keras.layers import Dense, Activation
import numpy as np
import preprocessing
import pandas as pd
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from numpy import argmax
import re

# Model Template
x_train, y_train, x_val, y_val, x_test, y_test = preprocessing.process(
    "images.npy", "labels.npy")

# val = int(raw_input("Max Depth: "))
model = tree.DecisionTreeClassifier(max_depth=10)


# 4 different types of feature extraction
# 1. avg pixel values for each number
def getAvgPixelIntensity(x_set):
    pixelIntensity = 0
    pics = []
    # print(x_set.shape)
    for picture in x_set:
        sum = 0
        for pixel in picture:
            sum += pixel
def data():
    start_train = '2008-08-08'
    end_train = '2014-12-31'
    start_val = '2015-01-02'
    end_val = '2016-07-01'
    max_sequence_length = 110
    vocab_size = 3000

    # read csv file
    DJIA = pd.read_csv("Combined_News_DJIA.csv",
                       usecols=['Date', 'Label', 'Top1', 'Top2', 'Top3', 'Top4', 'Top5',
                                'Top6', 'Top7', 'Top8', 'Top9', 'Top10', 'Top11', 'Top12',
                                'Top13', 'Top14', 'Top15', 'Top16', 'Top17', 'Top18',
                                'Top19', 'Top20', 'Top21', 'Top22', 'Top23', 'Top24', 'Top25'])

    # create training and testing dataframes of 80% and 20% respectively
    Training_dataframe = DJIA[(DJIA['Date'] >= start_train) & (DJIA['Date'] <= end_train)]
    Testing_dataframe = DJIA[(DJIA['Date'] >= start_val) & (DJIA['Date'] <= end_val)]

    attrib = DJIA.columns.values
    x_train = Training_dataframe.loc[:, attrib[2:len(attrib)]]
    y_train = Training_dataframe.iloc[:, 1]
    x_test = Testing_dataframe.loc[:, attrib[2:len(attrib)]]
    y_test = Testing_dataframe.iloc[:, 1]

    y_train = to_categorical(y_train)
    y_test = to_categorical(y_test)

    # merge the 25 news items together to form a single signal
    merged_x_train = x_train.apply(lambda x: ''.join(str(x.values)), axis=1)
    merged_x_test = x_test.apply(lambda x: ''.join(str(x.values)), axis=1)

    # ===============
    # pre-process
    # ===============
    merged_x_train = merged_x_train.apply(lambda x: pp.process(x))
    merged_x_test = merged_x_test.apply(lambda x: pp.process(x))
    # merged_x_train = merged_x_train.apply(lambda x: pp.lemmanouns(pp.lemmaverbs(pp.lemmaadjectives(x))))
    # merged_x_test = merged_x_test.apply(lambda x: pp.lemmanouns(pp.lemmaverbs(pp.lemmaadjectives(x))))
    # merged_x_train = merged_x_train.apply(lambda x: pp.stemmer(x))
    # merged_x_test = merged_x_test.apply(lambda x: pp.stemmer(x))

    # remove stopwords in the training and testing set
    train_without_sw = []
    test_without_sw = []
    train_temporary = list(merged_x_train)
    test_temporary = list(merged_x_test)
    s = pp.stop_words
    for i in train_temporary:
        f = i.split(' ')
        for j in f:
            if j in s:
                f.remove(j)
        s1 = ""
        for k in f:
            s1 += k + " "
        train_without_sw.append(s1)
    merged_x_train = train_without_sw
    for i in test_temporary:
        f = i.split(' ')
        for j in f:
            if j in s:
                f.remove(j)
        s1 = ""
        for k in f:
            s1 += k + " "
        test_without_sw.append(s1)
    merged_x_test = test_without_sw

    # tokenize and create sequences
    tokenizer = Tokenizer(num_words=vocab_size)
    tokenizer.fit_on_texts(merged_x_train)
    x_train_sequence = tokenizer.texts_to_sequences(merged_x_train)
    x_test_sequence = tokenizer.texts_to_sequences(merged_x_test)
    word_index = tokenizer.word_index
    input_dim = len(word_index) + 1
    print('Found %s unique tokens.' % len(word_index))

    x_train_sequence = pad_sequences(x_train_sequence, maxlen=max_sequence_length)
    x_test_sequence = pad_sequences(x_test_sequence, maxlen=max_sequence_length)
    print('Shape of training tensor:', x_train_sequence.shape)
    print(x_train_sequence)
    print('Shape of testing tensor:', x_test_sequence.shape)
    print(x_test_sequence)

    """
    Data providing function:
    This function is separated from create_model() so that hyperopt
    won't reload data for each evaluation run.
    """
    return x_train_sequence, y_train, x_test_sequence, y_test
        precision, recall, f1 = calculate_quality(_OUTPUT_PATTERN % metric,
                                                  _REFERENCE_FILENAME)
        print("Metric %s:" % metric)
        print("\tPrecision: %f, recall: %f, f1: %f" % (precision, recall, f1))
    except:
        pass
    time2 = time()
    print("Run for %f s." % (time2 - time1))
else:
    metric_txt = action
    metric = dice_metric if action == 'dice' else cosine_metric if action == 'cosine' else lcs_metric
    print("Preprocessing data...")
    print("Input: %s" % _INPUT_FILENAME)
    counter = 0
    preprocessed = {}
    result = {}
    with open(_INPUT_FILENAME) as input:
        for line in input:
            preprocessed_line = process(line)
            preprocessed[line] = preprocessed_line
            if _DEBUG and counter % 50 == 0:
                print("%s => %s" % (line, preprocessed_line))
            counter += 1
    print("Clustering...")
    clusters = cluster(preprocessed, metric, _THRESHOLDS[metric_txt], _DEBUG)
    for line, preprocessed_line in preprocessed.items():
        result[line] = clusters[preprocessed_line]
    print("Writing result...")
    write_result(result, _OUTPUT_PATTERN % metric_txt)
    time2 = time()
    print("Run for %f s." % (time2 - time1))
import matplotlib
matplotlib.use('Agg')

import mpld3
from pandas import DataFrame
import seaborn as sns
import matplotlib.pyplot as plt
import pylab as pl
import preprocessing
import numpy as np
import plots.pie as pie

print("Starting")
imgs = []
data = preprocessing.process()

dispXSpending = data.groupby([data['SO_DISPOSITIVO']])['VALOR_PRODUTOS']
info = dispXSpending.sum()
imgs.append(pie.plot(info))

########################################

imgs.append(plt.figure())
single = [[0 for _ in range(7)] for _ in range(24)]
for index, row in data.iterrows():
    single[row['HORA_PEDIDO']][row['DIA_PEDIDO']] += 1
df = DataFrame(single, index=range(0, 24, 1), columns=range(0, 7, 1))
# data
from data_downloader import downloader
from preprocessing import process

# model
from sklearn_crfsuite import CRF

# evaluation
from sklearn_crfsuite.metrics import flat_classification_report
from sklearn_crfsuite.metrics import flat_f1_score
from sklearn_crfsuite.metrics import flat_accuracy_score

data = downloader()
X_train, Y_train = process(data)

crf4 = CRF(algorithm='lbfgs',
           max_iterations=20,
           c1=0.1,
           c2=0.2,
           all_possible_transitions=False)

# training model
crf4.fit(X=X_train, y=Y_train)

# generate predictions
pred = crf4.predict(X_train)

# generate report on entire model
report = flat_classification_report(y_pred=pred, y_true=Y_train)
print(report)
summary_hist_b = tf.summary.histogram('W', weights_of_model[1])
summary_loss = tf.summary.scalar('Loss', loss)
# summary_train_acc = tf.summary.scalar('Training Accuracy', train_accuracy)
summary_op = tf.summary.merge_all()

init = tf.global_variables_initializer()

with tf.Session(graph=linear) as sess:
    sess.run(init)
    train_writer = tf.summary.FileWriter('summary_directory', sess.graph)
    training_data, training_labels, validation_data, validation_labels, testing_data, testing_labels = process()
    step = 0
    for epoch in range(1, 2):
        for i in range(0, len(training_data) - mini_batch, mini_batch):
            x_feed = []
            y_feed = []
            for j in range(i, i + mini_batch):
                x_feed.append(training_data[j])
                y_feed.append(training_labels[j])
            loss_, _, summary_full = sess.run(
                [loss, train_opt, summary_op],
                feed_dict={train_data: x_feed,
                           label_data: y_feed,
                           batch_size: 100.0 / float(len(x_feed))})
            # , keep_prob: 0.5})
            step += 1
import numpy as np
from sklearn.decomposition import PCA
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale

# local project modules (assumed): tokenizer and sentence-embedding helpers
import preprocessing as pp
import encoding

bodies = "../../data/train_bodies.csv"
stances = "../../data/train_stances.csv"
content = pd.read_csv(bodies, sep=",")
headlines = pd.read_csv(stances, sep=",")

# generate the necessary token features for the news heading and news body
content['content_tokens'] = content.articleBody.apply(lambda x: pp.process(x))
headlines['headline_tokens'] = headlines.Headline.apply(lambda x: pp.process(x))

# begin sentence embedding
header_vectors = np.zeros((headlines.shape[0], 300))
for i, q in enumerate(headlines.headline_tokens.values):
    header_vectors[i, :] = encoding.tovector(q)

# create the content vectors
content_vectors = np.zeros((content.shape[0], 300))
for i, q in enumerate(content.content_tokens.values):
    content_vectors[i, :] = encoding.tovector(q)

header_series = pd.Series(header_vectors.tolist())
headlines['headline_vector'] = header_series.values
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output  # for callbacks

import preprocessing
from plots import Plot

df = preprocessing.process()
plot = Plot(df)

# Launch the application:
app = dash.Dash(__name__)

app.layout = html.Div(
    children=[
        # search and table
        html.Div(children=[
            dcc.Input(id="search_input",
                      placeholder='Enter a value...',
                      type='text',
                      value=''),
            html.Div(dcc.Graph(id="table")),
        ]),
        # row of 2 barcharts
        html.Div(children=[
            html.Div(children=[
                html.Div(dcc.Graph(id="overall_bc")),
                dcc.Slider(id="overall_slider",
                           marks={i: str(i)
# -*- coding: utf8 -*-
import pandas as pd
import numpy as np
import sys

from sklearn.feature_extraction.text import CountVectorizer
from sklearn import linear_model

import preprocessing

train_file, test_file = sys.argv[1], sys.argv[2]
data = preprocessing.process(train_file)

vectorizer = CountVectorizer(analyzer="word", max_features=2000)
train_data_features = vectorizer.fit_transform(data['text']).toarray()


def show_word_frequencies(out_file, print_data):
    vectorizer = CountVectorizer(analyzer="word", max_features=2000)
    data_features = vectorizer.fit_transform(print_data['text']).toarray()
    words = vectorizer.get_feature_names()
    frequencies = np.sum(data_features, axis=0)
    with open(out_file, "w+") as f:
        for fr, word in sorted(zip(frequencies, words), reverse=True):
            f.write(str(fr) + word + '\n')


data[data['label'] == '1'].to_csv('bad_vocab.txt', sep='\t', encoding='utf-8')
data[data['label'] == '0'].to_csv('good_vocab.txt', sep='\t', encoding='utf-8')
show_word_frequencies("bad_features.txt", data[data['label'] == '1'])
show_word_frequencies("good_features.txt", data[data['label'] == '0'])
import numpy as np
from sklearn import svm

from preprocessing import process

X, Y, v_x, v_y, t_x, t_y = process()
total = len(v_y)

clf = svm.SVC()
clf.fit(X, Y)

v_y_ = clf.predict(v_x)
p = np.sum(v_y_ == np.array(v_y))
accuracy = p / float(total)
print(accuracy)