    def test_process_labels_target_five_labels(self):
        input = cleandoc('''
            jump :label1 notEqual @unit null
            label1:
            label2:
            jump :label2 notEqual @unit null
            ubind @mono
            label3:
            jump :label5 notEqual @unit null
            label4:
            end
            jump :label4 notEqual @unit null
            label5:
            jump :label3 notEqual @unit null
        ''')
        expected = cleandoc('''
            jump 2 notEqual @unit null
            jump 2 notEqual @unit null
            ubind @mono
            jump 7 notEqual @unit null
            end
            jump 5 notEqual @unit null
            jump 4 notEqual @unit null
        ''')

        uut = Preprocessor(input)
        uut.process_labels()

        actual = uut.result

        self.assertEqual(expected, actual)
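The tests above pin down the label semantics: a line ending in ':' declares a label, a ':name' operand resolves to the 1-indexed output position where that label stood, blank lines are kept, and a jump to an unknown label raises a ValueError naming the source line of the offending jump. As a reading aid, here is a minimal sketch of a Preprocessor satisfying those expectations; it is an illustration under these assumptions, not the project's actual implementation.

import re


class Preprocessor:
    '''Sketch: resolve ':label' jump targets in Mindustry-style logic.'''

    def __init__(self, source):
        self.source = source
        self.result = None

    def process_labels(self):
        lines = self.source.splitlines()
        targets = {}   # label name -> 1-indexed line it resolves to
        kept = []      # (source line number, text) of every non-label line
        for number, line in enumerate(lines, start=1):
            stripped = line.strip()
            if stripped.endswith(':'):
                # A label points at the position its successor will occupy.
                targets[stripped[:-1]] = len(kept) + 1
            else:
                kept.append((number, line))

        resolved = []
        for number, text in kept:
            def substitute(match, number=number):
                name = match.group(1)
                if name not in targets:
                    raise ValueError(f'Line {number}: unknown label :{name}')
                return str(targets[name])
            resolved.append(re.sub(r':(\w+)', substitute, text))

        self.result = '\n'.join(resolved)

    def get_result(self):
        return self.result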
Example #2
    def __init__(self, args):
        self.args = args
        self.args.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        self.model_param_fname = os.path.join(
            "weights",
            f"best_model_{self.args.pretrained_model}_{self.args.cnn_mode}.pt")
        self.preprocessor = Preprocessor(self.args)
    def test_process_labels(self):
        input = CUSTOM_SIMPLE
        expected = STANDARD_SIMPLE

        uut = Preprocessor(input)
        uut.process_labels()

        actual = uut.result

        self.assertEqual(expected, actual)
    def test_process_labels_target_does_not_exist(self):
        input = cleandoc('''
            jump :label1 notEqual @unit null
            ubind @mono
            end
        ''')

        uut = Preprocessor(input)

        with self.assertRaises(ValueError):
            uut.process_labels()
    def test_process_labels_bad_target_error_line_number_simple_program(self):
        input = cleandoc('''
            jump :label1 notEqual @unit null
            ubind @mono
            end
        ''')

        uut = Preprocessor(input)

        with self.assertRaisesRegex(ValueError, r'Line 1'):
            uut.process_labels()
Example #6
def preprocess(p_src: str) -> str:
    global tok_debug
    global parse_debug
    global env_debug

    pps = Preprocessor()
    src, flags = pps.scan(p_src)

    tok_debug = flags["tok_debug"]
    parse_debug = flags["parse_debug"]
    env_debug = flags["env_debug"]

    return src
    def test_process_labels_unreferenced_target(self):
        input = cleandoc('''
            jump 3 notEqual @unit null
            ubind @mono
            label1:
            end
        ''')
        expected = STANDARD_SIMPLE

        uut = Preprocessor(input)
        uut.process_labels()

        actual = uut.result

        self.assertEqual(expected, actual)
Example #8
    def test_color2bw(self):
        '''
        Loads the main input image and converts it to grayscale.
        '''
        img_test = cv2.imread('test/img/img.tif')
        dimensions = Preprocessor.color2bw(self, img_test)
        self.assertEqual(np.shape(dimensions), (5000, 5000))
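The assertion above only requires a single-channel 5000x5000 result, so color2bw is presumably a thin wrapper around cv2.cvtColor. A hedged sketch (the real method is not shown here), written as a free function:

import cv2


def color2bw(img):
    # Collapse a 3-channel BGR image to one grayscale channel, so the shape
    # drops from (H, W, 3) to (H, W), matching the (5000, 5000) check.
    return cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)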
Example #9
    def test_scale_images(self):
        '''
        Loads the 4 pieces of the full image, each with dimensions (2500, 2500),
        and eliminates one dimension, because imread loads 3 dimensions even if
        the image is in grayscale.
        '''

        # It has to read the 4 incoming images
        img_test_1 = cv2.imread('test/img/test_crop_image_1.png',
                                cv2.IMREAD_GRAYSCALE)
        img_test_2 = cv2.imread('test/img/test_crop_image_2.png',
                                cv2.IMREAD_GRAYSCALE)
        img_test_3 = cv2.imread('test/img/test_crop_image_3.png',
                                cv2.IMREAD_GRAYSCALE)
        img_test_4 = cv2.imread('test/img/test_crop_image_4.png',
                                cv2.IMREAD_GRAYSCALE)

        img_test_list = [img_test_1, img_test_2, img_test_3, img_test_4]

        scaled = Preprocessor.scale_images(self, img_test_list)

        self.assertEqual(len(scaled), 4)
        self.assertEqual(np.shape(scaled[0]), (512, 512))
        self.assertEqual(np.shape(scaled[1]), (512, 512))
        self.assertEqual(np.shape(scaled[2]), (512, 512))
        self.assertEqual(np.shape(scaled[3]), (512, 512))
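A matching sketch for scale_images, assuming each (2500, 2500) piece is simply resized to the (512, 512) shape the assertions expect:

import cv2


def scale_images(images, size=(512, 512)):
    # Downscale every grayscale piece to a fixed resolution; INTER_AREA is a
    # reasonable interpolation choice when shrinking.
    return [cv2.resize(img, size, interpolation=cv2.INTER_AREA) for img in images]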
Example #10
def main(config):
    scanner = Scanner(config)
    preprocessor = Preprocessor(config)
    utils = Utils(config)

    scanner.scan(front=True)
    if config.manual_duplex:
        print('rotate pages and press enter')
        input()
        scanner.scan(front=False)
    pages = scanner.get_pages()
    preprocessor.process(pages)
    exporter = Exporter(config)
    exporter.save_as_pdf(pages)
    if utils.show_preview():
        exporter.upload_doc()
    utils.clean_up(pages)
    def test_process_labels_consecutive_targets(self):
        input = cleandoc('''
            jump :label2 notEqual @unit null
            ubind @mono
            label1:
            label2:
            label3:
            end
        ''')
        expected = STANDARD_SIMPLE

        uut = Preprocessor(input)
        uut.process_labels()

        actual = uut.result

        self.assertEqual(expected, actual)
Example #12
class Predictor:
    def __init__(self, args):
        self.args = args
        self.args.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        self.model_param_fname = os.path.join(
            "weights",
            f"best_model_{self.args.pretrained_model}_{self.args.cnn_mode}.pt")
        self.preprocessor = Preprocessor(self.args)

    def set_seed(self):
        random.seed(self.args.seed)
        np.random.seed(self.args.seed)
        torch.manual_seed(self.args.seed)
        # torch.backends.cudnn.deterministic = True

    def get_vocab_info(self):
        self.vocab_size = len(self.TEXT.vocab)
        self.pad_index = self.TEXT.vocab.stoi['<pad>']
        self.unk_index = self.TEXT.vocab.stoi['<unk>']

        print(
            f"input dim: {self.vocab_size}\npad index: {self.pad_index}\nunk index: {self.unk_index}"
        )

    def run(self):
        self.set_seed()

        reviews, word2index = self.preprocessor.build_data()

        processed_csv_fname = os.path.join(self.args.data_path,
                                           self.args.processed_csv)
        self.TEXT, self.train_iterator, self.valid_iterator = data_load_without_cv(
            processed_csv_fname, self.args)
        self.get_vocab_info()
        word2vec_index, word2vec_vector = load_pretrained_word2vec(
            self.args.data_path, self.TEXT)

        self.TEXT.vocab.set_vectors(
            word2vec_index,
            torch.from_numpy(word2vec_vector).float().to(self.args.device),
            self.args.embedding_dim)

        self.model = PolarCNN(self.vocab_size, self.pad_index,
                              self.args).to(self.args.device)
        self.criterion = torch.nn.BCEWithLogitsLoss().to(self.args.device)

        # for prediction
        if os.path.isfile(self.model_param_fname):
            self.model.load_state_dict(torch.load(self.model_param_fname))
            probability, predicted_label = predict(self.TEXT, self.args,
                                                   self.model)
            print_predict_log(self.args, probability, predicted_label)
        else:
            print(
                f"model parameter file {self.model_param_fname} does not exist")
            print("for sentence prediction, please train your model first")
    def test_process_labels_bad_target_error_line_number_between_other_targets(self):
        input = cleandoc('''
            jump :label1 notEqual @unit null
            label1:
            label2:
            jump :label2 notEqual @unit null
            ubind @mono
            label3:
            jump :label5 notEqual @unit null
            end
            jump :label4 notEqual @unit null
            label5:
            jump :label3 notEqual @unit null
        ''')

        uut = Preprocessor(input)

        with self.assertRaisesRegex(ValueError, r'Line 9'):
            uut.process_labels()
Example #14
def preprocess_pipeline():
    """Read in raw lexicon and create preprocessed version."""
    for language, n in config.LANGUAGES_N:
        print("Preprocessing lexica for: {language}".format(language=language))
        if not os.path.exists("data/processed/{lan}".format(lan=language)):
            print("Creating directory: data/processed/{lan}".format(
                lan=language))
            os.mkdir("data/processed/{lan}".format(lan=language))
        if not os.path.exists(
                "data/processed/{lan}/reals".format(lan=language)):
            print("Creating directory: data/processed/{lan}/reals".format(
                lan=language))
            os.mkdir("data/processed/{lan}/reals".format(lan=language))
        config_dict = get_config_dict(config, language)
        ## Reset n according to language
        config_dict['n'] = n
        preprocessor = Preprocessor(**config_dict)
        info_for_generation = preprocessor.preprocess_lexicon()
        print("Now getting minimal pairs")
        preprocessor.get_minimal_pairs()
    def test__tokenise_label_len_matches_tokens(self):
        # Arrange
        x = [
            "Comparison with", "alkaline phosphatases", "and", " ", " ",
            "5-nucleotidase"
        ]
        y = ["o", "s", "o", "o", "o", "s"]

        tokensier = MagicMock()
        tokensier.tokenize.side_effect = lambda z: list(
            filter(lambda y: y != "", z.split(" ")))

        label_mapper = MagicMock()
        label_mapper.entity_labels = ["s"]
        label_mapper.continuation_symbol = {"s": "sc"}
        label_mapper.other_label = "o"

        sut = Preprocessor(max_feature_len=5,
                           tokeniser=tokensier,
                           label_mapper=label_mapper)
        sut._x = x
        sut._y = y

        # Act
        sut._tokenise()

        # Assert
        self.assertEqual(len(sut._x), len(sut._y))
    def test__tokenise(self):
        # Arrange
        x = [
            "Comparison with", "alkaline phosphatases", "and", "5-nucleotidase"
        ]
        y = ["o", "s", "o", "s"]

        expected_x = [
            "Comparison", "with", "alkaline", "phosphatases", "and",
            "5-nucleotidase"
        ]
        expected_y = ["o", "o", "s", "sc", "o", "s"]

        tokensier = MagicMock()
        tokensier.tokenize.side_effect = lambda x: x.split(" ")

        label_mapper = MagicMock()
        label_mapper.entity_labels = ["s"]
        label_mapper.continuation_symbol = {"s": "sc"}
        label_mapper.other_label = "o"

        sut = Preprocessor(max_feature_len=5,
                           tokeniser=tokensier,
                           label_mapper=label_mapper)
        sut._x = x
        sut._y = y

        # Act
        sut._tokenise()

        # Assert
        self.assertSequenceEqual(expected_x, sut._x)
        self.assertSequenceEqual(expected_y, sut._y)
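# The two _tokenise tests above fix its contract: every chunk in _x is split
# into word tokens, the first token keeps the chunk's label, further tokens of
# an entity chunk get that label's continuation symbol (other labels repeat),
# and chunks that tokenise to nothing contribute neither tokens nor labels, so
# _x and _y stay the same length. A sketch of such a method, assuming the
# constructor stores its arguments as self.tokeniser and self.label_mapper
# (an assumption for illustration, not the real implementation):
def _tokenise(self):
    # Expand (chunk, label) pairs into token/label pairs of equal length.
    tokens, labels = [], []
    for chunk, label in zip(self._x, self._y):
        for i, token in enumerate(self.tokeniser.tokenize(chunk)):
            tokens.append(token)
            if i == 0 or label not in self.label_mapper.entity_labels:
                labels.append(label)
            else:
                labels.append(self.label_mapper.continuation_symbol[label])
    self._x, self._y = tokens, labels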
    def test__to_label_to_index(self):
        """Make sure label to index for y works"""
        # Arrange
        x = [
            "[CLS]",
            "Comparison", "with", "alkaline", "phosphatases", "and",
            "5-nucleotidase", "[PAD]"
        ]
        y = ["[PAD]", "o", "o", "s", "sc", "o", "s", "[PAD]"]
        labels = ["s", "sc", "o"]

        fake_labels = ["[PAD]"] + ["s", "sc", "o"]

        expected_y = [fake_labels.index(x) - 1 for x in y]

        tokensier = MagicMock()
        tokensier.tokenize.side_effect = lambda x: x.split(" ")

        label_mapper = MagicMock()
        label_mapper.entity_labels = ["s"]
        label_mapper.continuation_symbol = {"s": "sc"}
        label_mapper.other_label = "o"
        label_mapper.label_to_index = lambda x: labels.index(x)

        sut = Preprocessor(max_feature_len=5,
                           tokeniser=tokensier,
                           label_mapper=label_mapper)
        sut._x = x
        sut._y = y

        # Act
        sut._to_label_index()

        # Assert
        self.assertSequenceEqual(expected_y, sut._y)
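# test__to_label_to_index expects _to_label_index to map every real label in
# _y through label_mapper.label_to_index while the "[PAD]" marker becomes -1
# (the value the expected_y computation yields for it). A minimal sketch; the
# "[PAD]" literal is an assumption taken from the test data:
def _to_label_index(self):
    # Convert string labels to integer indices; padding positions become -1
    # so they can be ignored later (e.g. by the loss function).
    self._y = [
        -1 if label == "[PAD]" else self.label_mapper.label_to_index(label)
        for label in self._y
    ]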
def main():
    train = pd.read_csv('../data/train.csv')
    test = pd.read_csv('../data/test.csv')

    X = train[selected_fields]

    # astype returns a new object, so assign the converted columns back
    X[encoding_fields] = X[encoding_fields].astype('category')
    y = train['SalePrice']

    preprocessor = Preprocessor()
    preprocessor.train(X)
    X = preprocessor.transform(X)
    y = np.array(y)

    clf = RandomForestRegressor()
    score = cross_val_score(clf, X, y, cv=5)
    print(score)

    X_test = test[selected_fields]
    X_test[encoding_fields] = X_test[encoding_fields].astype('category')
    X_test = preprocessor.transform(X_test)
    clf.fit(X, y)
    y_pred = clf.predict(X_test)

    result = pd.DataFrame({
        'SalePrice': y_pred
    }, index=test['Id'])
    result.to_csv('output.csv')
    def test_process_labels_target_two_labels(self):
        input = cleandoc('''
            label1:
            jump :label2 notEqual @unit null
            ubind @mono
            label2:
            jump :label1 notEqual @unit null
            end
        ''')
        expected = cleandoc('''
            jump 3 notEqual @unit null
            ubind @mono
            jump 1 notEqual @unit null
            end
        ''')

        uut = Preprocessor(input)
        uut.process_labels()

        actual = uut.result

        self.assertEqual(expected, actual)
    def test_process_labels_empty_lines(self):
        input = cleandoc('''
            jump :label1 notEqual @unit null
            ubind @mono

            label1:

            end
        ''')
        expected = cleandoc('''
            jump 4 notEqual @unit null
            ubind @mono


            end
        ''')

        uut = Preprocessor(input)
        uut.process_labels()

        actual = uut.result

        self.assertEqual(expected, actual)
Example #21
    def _get_words(path):
        reader = EpubReader(path)
        book_text = reader.get_text()
        book_text = Preprocessor.process(book_text)

        words_to_occurrences = dict()
        for word in book_text:
            if word not in words_to_occurrences:
                words_to_occurrences[word] = Word(word)

            words_to_occurrences[word].add_occurrence()

        words_and_occurrences = list(words_to_occurrences.values())
        words_and_occurrences.sort(key=lambda elem: elem.occurrences,
                                   reverse=True)

        return words_and_occurrences
Example #22
    def test_crop_image(self):
        '''
        Loads an image from the test image library in grayscale and passes it
        to the crop_image function, which returns an array composed of 4
        images of (2500, 2500).

        The test checks the result by validating the dimensions and, finally,
        that the array is composed of 4 images.
        '''

        img_test = cv2.imread('test/img/test_color_gray.png',
                              cv2.IMREAD_GRAYSCALE)
        cropped = Preprocessor.crop_image(self, img_test)

        self.assertEqual(np.shape(cropped[0]), (2500, 2500))
        self.assertEqual(np.shape(cropped[1]), (2500, 2500))
        self.assertEqual(np.shape(cropped[2]), (2500, 2500))
        self.assertEqual(np.shape(cropped[3]), (2500, 2500))

        self.assertEqual(len(cropped), 4)
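crop_image is only checked for producing four (2500, 2500) pieces from a 5000x5000 input, so a plausible sketch is a plain quadrant split (an assumption, not necessarily the real implementation):

def crop_image(img):
    # Split the image into its four quadrants: top-left, top-right,
    # bottom-left, bottom-right.
    h, w = img.shape[:2]
    h2, w2 = h // 2, w // 2
    return [img[:h2, :w2], img[:h2, w2:], img[h2:, :w2], img[h2:, w2:]]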
Example #23
def heldout_surprisal():
    """Calculate heldout surprisal"""
    for language, n in config.LANGUAGES_N:
        print("Calculating held-out surprisal for: {language}".format(
            language=language))

        ## Get config dict
        config_dict = get_config_dict(config, language)
        PHON_COLUMN = config_dict['phon_column']

        ## Load real lexicon
        LOAD_PATH = "data/processed/{lan1}/reals/{lan2}_with_mps_{n}phone.csv".format(
            lan1=language, lan2=language, n=n)
        SAVE_PATH = "data/processed/{lan1}/reals/{lan2}_with_mps_{n}phone_holdout.csv".format(
            lan1=language, lan2=language, n=n)
        df_lexicon = pd.read_csv(LOAD_PATH)
        print(len(df_lexicon))

        # Get heldout surprisal
        print("Calculating heldout surprisal...")
        NUM_FOLDS = 1000
        print("Number of folds: {x}".format(x=NUM_FOLDS))
        df_real_heldout = Preprocessor.calculate_heldout_surprisal(
            df_lexicon[PHON_COLUMN].values, n=n, num_folds=NUM_FOLDS)
        df_real_heldout[PHON_COLUMN] = df_real_heldout['word']
        df_real_heldout = df_real_heldout[[
            PHON_COLUMN, 'heldout_log_prob', 'heldout_surprisal'
        ]]
        print(len(df_real_heldout))

        # Merge with real processed lexicon
        df_merged = pd.merge(df_lexicon, df_real_heldout, on=PHON_COLUMN)
        print(len(df_merged))

        print("Saving to: {path}".format(path=SAVE_PATH))
        df_merged.to_csv(SAVE_PATH)
    def test__call__no_label_runs_without_exceptions(self):
        x = [
            "Comparison with", "alkaline phosphatases", "and", "5-nucleotidase"
        ]
        labels = ["s", "sc", "o"]

        tokensier = MagicMock()
        tokensier.tokenize.side_effect = lambda x: x.split(" ")

        label_mapper = MagicMock()
        label_mapper.entity_labels = ["s"]
        label_mapper.continuation_symbol = {"s": "sc"}
        label_mapper.other_label = "o"
        label_mapper.label_to_index = lambda x: labels.index(x)

        sut = Preprocessor(max_feature_len=5,
                           tokeniser=tokensier,
                           label_mapper=label_mapper)

        # Act
        x, y = sut(x)

        # Assert
        self.assertIsNone(y)
Example #25
from sklearn.linear_model import Perceptron
from sklearn.svm import SVC
from src.preprocessor import Preprocessor
from random import sample
from src.classifier import Classifier
import src.conf as conf
import numpy as np
import pickle


preprocess = False
if preprocess:

    dataset_pickle = open(conf.project_path + 'data\dataset_cleared.pickle', 'rb')
    dataset = pickle.load(dataset_pickle)
    dataset_pickle.close()

    preprocessor = Preprocessor()

    dataset = list(map(lambda x: (preprocessor.preprocess(x[0]), x[1]), dataset))  # preprocess dataset
    preprocessed_dataset = open(conf.project_path + 'data\dataset_preprocessed.pickle', 'wb')
    pickle.dump(dataset, preprocessed_dataset)

dataset = pickle.load(open(conf.project_path + 'data\dataset_preprocessed.pickle', 'rb'))

dataset = [x for x in dataset if len(x[0].split()) > 0]

dataset = list(set(dataset))

classifiers = [(SVC(kernel='rbf', C=2.9, gamma=1), 'svm_rbf'),
               (SVC(kernel='linear'), 'svm_linear')]
               # (KNeighborsClassifier(), 'knn'),
               # (MultinomialNB(), 'naive_bayes'),
Example #26
def preprocess_lexicon(language):
    """Preprocess lexicon."""
    config_dict = get_config_dict(config, language)
    preprocessor = Preprocessor(**config_dict)
    info_for_generation = preprocessor.preprocess_lexicon()
    return info_for_generation
Example #27
import argparse
import sys

from src.preprocessor import Preprocessor


def parse_args():
    '''Parse and return command line arguments.'''
    parser = argparse.ArgumentParser(description='Mindustry Logic Preprocessor')
    parser.add_argument('--infile', '-f', nargs='?', type=argparse.FileType('r'),
                        help='Path to input file', default=sys.stdin)
    parser.add_argument('--outfile', '-o', nargs='?', type=argparse.FileType('w'),
                        help='Path to output file', default=sys.stdout)

    return parser.parse_args()


if __name__ == '__main__':
    cli = parse_args()

    with cli.infile as input:
        processor = Preprocessor(input.read())

    try:
        processor.process_labels()
    except Exception as exc:
        sys.stderr.write(str(exc))

    with cli.outfile as output:
        output.write(processor.get_result())
Example #28
from src.preprocessor import Preprocessor

if __name__ == '__main__':
    preprocessor = Preprocessor()
    preprocessor.run(metadata_filename='./data/metadata.feather')
    preprocessor.save(tfidf_weight_filename='./data/tfidf.json', idf_filename='./data/idf.json')
Example #29
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from src.preprocessor import Preprocessor
from random import sample
import numpy as np

import src.conf as conf
import pickle

test = open(conf.project_path + '/data/SemEval2016-task4-test.subtask-BD.txt',
            'r').readlines()
gold = open(
    conf.project_path + '/data/SemEval2016_task4_subtaskB_test_gold.txt',
    'r').readlines()

p = Preprocessor()

preprocess = False

if preprocess:
    dataset = [p.preprocess(t.split('\t')[3].replace('\n', '')) for t in test]
    pickle.dump(
        dataset, open(conf.project_path + 'data/test_preprocessed.pickle',
                      'wb'))
else:
    dataset = pickle.load(
        open(conf.project_path + 'data/test_preprocessed.pickle', 'rb'))

labels = [t.split('\t')[2].replace('\n', '') for t in gold]

test_dataset = [(example, label) for example, label in zip(dataset, labels)]
Example #30
def main():

    if len(sys.argv) < 2:
        Helper.print("Too few arguments given. Use -help to get help.")
        return

    if len(sys.argv) > 3:
        Helper.print("Too many arguments given. Use -help to get help.")
        return

    input_path_string = sys.argv[1]

    if input_path_string == "-help":
        Helper.print("Following arguments are required:")
        Helper.print("[0] absolute path to source folder")
        Helper.print("[1] absolute path to output folder")
        Helper.print("Example: \"C:\\temp\\data\\input\" \"C:\\temp\\data\\output\"")
        return

    output_path_string = sys.argv[2]

    Helper.blockPrint()
    # declare folder paths
    root_dir = os.path.abspath(os.sep)
    root = os.path.join(root_dir, "temp", "TabExImg")
    FileHelper.createPathIfNotExisting(root)
    input_path = input_path_string  # multiple scanned PDFs
    FileHelper.createPathIfNotExisting(input_path)
    pdf_images_path = os.path.join(root, "01_pdf_images")  # multiple scanned PDFs
    FileHelper.createPathIfNotExisting(pdf_images_path)
    preprocessed_images_path = os.path.join(root, "02_preprocessed_images")  # folder per pdf | preprocessed images
    FileHelper.createPathIfNotExisting(preprocessed_images_path)
    treated_pdfs_path = os.path.join(root, "03_treated_pdfs")
    FileHelper.createPathIfNotExisting(treated_pdfs_path)
    output_path = output_path_string
    FileHelper.createPathIfNotExisting(output_path)
    output_boundaries_path = os.path.join(output_path, "excel")
    FileHelper.createPathIfNotExisting(output_boundaries_path)
    output_pdf_path = os.path.join(output_path, "pdf")
    FileHelper.createPathIfNotExisting(output_pdf_path)

    # delete eventually still existing old files (01_pdf_images, 02_preprocessed_images, 03_treated_pdfs)
    Helper.print("Precautionary delete files of previous runs...")
    FileHelper.deleteAllFilesInFolder(pdf_images_path)
    FileHelper.deleteAllFilesInFolder(preprocessed_images_path)
    FileHelper.deleteAllFilesInFolder(treated_pdfs_path)

    # convert PDFs to images
    pdfConverter = ConvertPdf(input_path, pdf_images_path)
    pdfConverter.convertPdfs()

    # preprocess image files
    preprocessor = Preprocessor(input_path, pdf_images_path, preprocessed_images_path, treated_pdfs_path)
    preprocessor.execute()

    # move original PDFs to backup folder
    FileHelper.moveFiles(input_path, treated_pdfs_path)

    # detect table boundaries
    detection = Detection(preprocessed_images_path, output_path, output_boundaries_path, output_pdf_path)
    detection.detectTableBoundaries()

    # combine files
    Helper.print("Start Opitcal Character Recognition...")
    Helper.print("This can take some time...")
    ocrConverter = OcrConverter()
    ocrConverter.convertAllImagesToPdfs(pdf_images_path, output_pdf_path)
    ocrConverter.combinePdfs(output_pdf_path)
    Helper.print("Opitcal Character Recognition done...")

    Helper.print("Start cleanup temporary files...")
    # delete old files (01_pdf_images, 02_preprocessed_images, 03_treated_pdfs)
    FileHelper.deleteAllFilesInFolder(pdf_images_path)
    FileHelper.deleteAllFilesInFolder(preprocessed_images_path)
    FileHelper.deleteAllFilesInFolder(treated_pdfs_path)
    Helper.print("Cleanup done...")
    Helper.print("Table Detection Done")
    Helper.print("Result Files in " + output_path)