Example #1
import os
import tempfile
from os.path import basename


def validate(training_settings_pickle, test_data_file, output_dir):
    # Parse data
    loader = BiocLoader()
    data = loader.parse(test_data_file)

    # Run validation
    pipeline = Pipeline()
    result = pipeline.validate(data, training_settings_pickle)

    # Save results in output_dir, named after the input file
    fd, output_file = tempfile.mkstemp(prefix=basename(test_data_file),
                                       suffix=".xml",
                                       dir=output_dir)
    os.close(fd)  # mkstemp returns an open file descriptor; close it first
    loader.dump(result, filename=output_file)
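A minimal invocation sketch, assuming BiocLoader and Pipeline come from the surrounding project; all three paths below are hypothetical placeholders:

# hypothetical usage; the paths are illustrative, not project files
validate(training_settings_pickle="model_settings.pkl",
         test_data_file="corpus.bioc.xml",
         output_dir="/tmp/validation_out")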
Example #2
import heapq

import pandas as pd
from sklearn.model_selection import train_test_split

# Tokenizer, Stemmer, Vectorizer, Model, OptimNN, Pipeline, and splitRows
# are assumed to be imported from the surrounding project

# module-level defaults; overridden by the "no-verb" and "norm" args below
verbose = True
norm = False


def main(argv):
    # construct our pipeline's transform list from the comma-separated
    # first command-line argument; the best way to pass per-transform
    # parameters on the command line is still an open question
    # (a dispatch-table alternative to this if/elif ladder is sketched
    # after the example)

    global verbose
    global norm
    split = None

    transforms = []
    for arg in argv[0].split(","):
        if arg == "toke":
            transforms.append(Tokenizer())
        elif arg == "stem":
            transforms.append(Stemmer())
        elif arg == "stem-porter":
            transforms.append(Stemmer(mode='Porter'))
        elif arg == "stem-lancaster":
            transforms.append(Stemmer(mode='Lancaster'))
        elif arg == "stem-lemmatize":
            transforms.append(Stemmer(mode='Lemmatize'))
        elif arg == "vect":
            transforms.append(Vectorizer())
        elif arg == "vect-tfidf":
            transforms.append(Vectorizer(mode='TFIDF'))
        elif arg == "vect-count":
            transforms.append(Vectorizer(mode='Count'))
        elif arg == "vect-lda-2":
            transforms.append(Vectorizer(mode='LDA', ldaSplits=2))
        elif arg == "vect-lda-10":
            transforms.append(Vectorizer(mode='LDA', ldaSplits=10))
        elif arg == "vect-lda-25":
            transforms.append(Vectorizer(mode='LDA', ldaSplits=25))
        elif arg == "vect-lda-50":
            transforms.append(Vectorizer(mode='LDA', ldaSplits=50))
        elif arg == "vect-lda-150":
            transforms.append(Vectorizer(mode='LDA', ldaSplits=150))
        elif arg == "vect-lda-500":
            transforms.append(Vectorizer(mode='LDA', ldaSplits=500))
        elif arg == "svm":
            transforms.append(Model('svm'))
        elif arg == "nb":
            transforms.append(Model('nb'))
        elif arg == "lr":
            transforms.append(Model('lr'))
        elif arg == "nn":
            transforms.append(
                Model('nn', inputDim=10000)
            )  # Configured for a Vectorizer whose vectors are limited to 10000
        elif arg == "norm":
            norm = True
        elif arg == "no-verb":
            verbose = False
        elif arg == "split-sentences":
            split = "sentences"
        elif arg == "nn-optim":
            # Memory optimized neural network.
            transforms.append(
                OptimNN(vecMode='TFIDF', epochs=2, batchSize=2048))
        else:
            raise ValueError(f"Invalid transformer {arg}")
    pipe = Pipeline(transforms, norm=norm)

    # read our data (paths hardcoded for now)
    dem_path = "./data/democrat_comments.pkl"
    rep_path = "./data/republican_comments.pkl"
    df0 = pd.read_pickle(dem_path)  # .sample(frac=0.05)  # DEBUG ONLY
    df1 = pd.read_pickle(rep_path)  # .sample(frac=0.05)  # DEBUG ONLY

    if split is not None:
        if verbose:
            print('Splitting Democrat comments')
        df0 = splitRows(df0, mode=split, verbose=verbose)

        if verbose:
            print('Splitting Republican comments')
        df1 = splitRows(df1, mode=split, verbose=verbose)

    # each frame's subreddit name serves as its class label
    label0 = df0.subreddit.iloc[0]
    label1 = df1.subreddit.iloc[0]

    # concatenate both classes and map the subreddit labels to 0/1
    X = pd.concat([df0.body, df1.body], ignore_index=True)
    y = pd.concat([df0.subreddit, df1.subreddit],
                  ignore_index=True).replace(to_replace=[label0, label1],
                                             value=[0, 1])

    # split into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.33,
                                                        random_state=42)

    if verbose:
        print('Applying Transforms and Training Model')
        print('Democrat data:', dem_path)
        print('Republican data:', rep_path)
        print('Transforms:', argv[0])
    # fit the pipeline on the training data
    pipe.fit_transform(X_train, y_train)

    # do the prediction
    y_pred = pipe.predict(X_test)
    results = pipe.validate(y_pred, y_test, True, True)

    # get the most surprising misclassifications for class 0
    print("Most surprising texts misclassified as class 0")
    idx_list = heapq.nlargest(5, results[2][0], key=lambda x: x[1])
    for i, (idx, prob) in enumerate(idx_list):
        print(f"{i}) probability class 1 = {prob}\n{X_test[idx]}\n")

    # get the most surprising misclassifications for class 1
    print("Most surprising texts misclassified as class 1")
    idx_list = heapq.nlargest(5, results[2][1], key=lambda x: x[1])
    for i, (idx, prob) in enumerate(idx_list):
        print(f"{i}) probability class 0 = {prob}\n{X_test[idx]}\n")