    def test_process_labels_target_five_labels(self):
        input = cleandoc('''
            jump :label1 notEqual @unit null
            label1:
            label2:
            jump :label2 notEqual @unit null
            ubind @mono
            label3:
            jump :label5 notEqual @unit null
            label4:
            end
            jump :label4 notEqual @unit null
            label5:
            jump :label3 notEqual @unit null
        ''')
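        # Jump targets are 1-based line numbers in the label-stripped output;
        # a label resolves to the first following line that is not a label.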
        expected = cleandoc('''
            jump 2 notEqual @unit null
            jump 2 notEqual @unit null
            ubind @mono
            jump 7 notEqual @unit null
            end
            jump 5 notEqual @unit null
            jump 4 notEqual @unit null
        ''')

        uut = Preprocessor(input)
        uut.process_labels()

        actual = uut.result

        self.assertEqual(expected, actual)
    def test__to_label_to_index(self):
        """Make sure label to index for y works"""
        # Arrange
        x = [
            "[CLS]",
            "Comparison", "with", "alkaline", "phosphatases", "and",
            "5-nucleotidase", "[PAD]"
        ]
        y = ["[PAD]", "o", "o", "s", "sc", "o", "s", "[PAD]"]
        labels = ["s", "sc", "o"]

        fake_labels = ["[PAD]"] + ["s", "sc", "o"]

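        # "[PAD]" occupies index 0 of fake_labels, so subtracting 1 maps the
        # real labels to the indices label_to_index yields and "[PAD]" to -1.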
        expected_y = [fake_labels.index(label) - 1 for label in y]

        tokeniser = MagicMock()
        tokeniser.tokenize.side_effect = lambda x: x.split(" ")

        label_mapper = MagicMock()
        label_mapper.entity_labels = ["s"]
        label_mapper.continuation_symbol = {"s": "sc"}
        label_mapper.other_label = "o"
        label_mapper.label_to_index = lambda x: labels.index(x)

        sut = Preprocessor(max_feature_len=5,
                           tokeniser=tokeniser,
                           label_mapper=label_mapper)
        sut._x = x
        sut._y = y

        # Act
        sut._to_label_index()

        # Assert
        self.assertSequenceEqual(expected_y, sut._y)
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score


# selected_fields and encoding_fields: column lists assumed to be defined
# at module level.
def main():
    train = pd.read_csv('../data/train.csv')
    test = pd.read_csv('../data/test.csv')

    X = train[selected_fields]

    # astype returns a new frame, so assign the result back
    X[encoding_fields] = X[encoding_fields].astype('category')
    y = train['SalePrice']

    preprocessor = Preprocessor()
    preprocessor.train(X)
    X = preprocessor.transform(X)
    y = np.array(y)

    clf = RandomForestRegressor()
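    # 5-fold cross-validation; prints one score per fold (R^2 for a regressor)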
    score = cross_val_score(clf, X, y, cv=5)
    print(score)

    X_test = test[selected_fields]
    X_test[encoding_fields] = X_test[encoding_fields].astype('category')
    X_test = preprocessor.transform(X_test)
    clf.fit(X, y)
    y_pred = clf.predict(X_test)

    result = pd.DataFrame({
        'SalePrice': y_pred
    }, index=test['Id'])
    result.to_csv('output.csv')


if __name__ == '__main__':
    main()
    def test__tokenise(self):
        # Arrange
        x = [
            "Comparison with", "alkaline phosphatases", "and", "5-nucleotidase"
        ]
        y = ["o", "s", "o", "s"]

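        # Tokenising splits each entry on spaces; the extra tokens of a
        # multi-token entity ("alkaline phosphatases") take the continuation
        # label "sc".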
        expected_x = [
            "Comparison", "with", "alkaline", "phosphatases", "and",
            "5-nucleotidase"
        ]
        expected_y = ["o", "o", "s", "sc", "o", "s"]

        tokeniser = MagicMock()
        tokeniser.tokenize.side_effect = lambda x: x.split(" ")

        label_mapper = MagicMock()
        label_mapper.entity_labels = ["s"]
        label_mapper.continuation_symbol = {"s": "sc"}
        label_mapper.other_label = "o"

        sut = Preprocessor(max_feature_len=5,
                           tokeniser=tokeniser,
                           label_mapper=label_mapper)
        sut._x = x
        sut._y = y

        # Act
        sut._tokenise()

        # Assert
        self.assertSequenceEqual(expected_x, sut._x)
        self.assertSequenceEqual(expected_y, sut._y)
    def test__tokenise_label_len_matches_tokens(self):
        # Arrange
        x = [
            "Comparison with", "alkaline phosphatases", "and", " ", " ",
            "5-nucleotidase"
        ]
        y = ["o", "s", "o", "o", "o", "s"]

        tokeniser = MagicMock()
        tokeniser.tokenize.side_effect = lambda z: list(
            filter(lambda y: y != "", z.split(" ")))

        label_mapper = MagicMock()
        label_mapper.entity_labels = ["s"]
        label_mapper.continuation_symbol = {"s": "sc"}
        label_mapper.other_label = "o"

        sut = Preprocessor(max_feature_len=5,
                           tokeniser=tokeniser,
                           label_mapper=label_mapper)
        sut._x = x
        sut._y = y

        # Act
        sut._tokenise()

        # Assert
        self.assertEqual(len(sut._x), len(sut._y))
Example #6
    def __init__(self, args):
        self.args = args
        self.args.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        self.model_param_fname = os.path.join(
            "weights",
            f"best_model_{self.args.pretrained_model}_{self.args.cnn_mode}.pt")
        self.preprocessor = Preprocessor(self.args)
    def test_process_labels(self):
        input = CUSTOM_SIMPLE
        expected = STANDARD_SIMPLE

        uut = Preprocessor(input)
        uut.process_labels()

        actual = uut.result

        self.assertEqual(expected, actual)
    def test_process_labels_target_does_not_exist(self):
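        # A jump to a label that is never defined must raise ValueError.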
        input = cleandoc('''
            jump :label1 notEqual @unit null
            ubind @mono
            end
        ''')

        uut = Preprocessor(input)

        with self.assertRaises(ValueError):
            uut.process_labels()
    def test_process_labels_bad_target_error_line_number_simple_program(self):
        input = cleandoc('''
            jump :label1 notEqual @unit null
            ubind @mono
            end
        ''')

        uut = Preprocessor(input)

        with self.assertRaisesRegex(ValueError, r'Line 1'):
            uut.process_labels()
Example #10
def preprocess(p_src: str) -> str:
    global tok_debug
    global parse_debug
    global env_debug

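    # scan() returns the rewritten source plus a dict of debug flags, which
    # are re-exported here as module-level globals.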
    pps = Preprocessor()
    src, flags = pps.scan(p_src)

    tok_debug = flags["tok_debug"]
    parse_debug = flags["parse_debug"]
    env_debug = flags["env_debug"]

    return src
    def test_process_labels_unreferenced_target(self):
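        # Numeric jump targets pass through unchanged; the unreferenced label
        # line is simply removed.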
        input = cleandoc('''
            jump 3 notEqual @unit null
            ubind @mono
            label1:
            end
        ''')
        expected = STANDARD_SIMPLE

        uut = Preprocessor(input)
        uut.process_labels()

        actual = uut.result

        self.assertEqual(expected, actual)
    def test_process_labels_consecutive_targets(self):
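        # Consecutive labels all resolve to the same following line.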
        input = cleandoc('''
            jump :label2 notEqual @unit null
            ubind @mono
            label1:
            label2:
            label3:
            end
        ''')
        expected = STANDARD_SIMPLE

        uut = Preprocessor(input)
        uut.process_labels()

        actual = uut.result

        self.assertEqual(expected, actual)
Example #13
def main(config):
    scanner = Scanner(config)
    preprocessor = Preprocessor(config)
    utils = Utils(config)

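    # Scan the front sides first; with manual duplex the user flips the stack
    # and the back sides are scanned in a second pass.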
    scanner.scan(front=True)
    if config.manual_duplex:
        print('rotate pages and press enter')
        input()
        scanner.scan(front=False)
    pages = scanner.get_pages()
    preprocessor.process(pages)
    exporter = Exporter(config)
    exporter.save_as_pdf(pages)
    if utils.show_preview():
        exporter.upload_doc()
    utils.clean_up(pages)
    def test_process_labels_bad_target_error_line_number_between_other_targets(self):
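        # The reported line number counts every input line, labels included:
        # the bad :label4 reference sits on line 9.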
        input = cleandoc('''
            jump :label1 notEqual @unit null
            label1:
            label2:
            jump :label2 notEqual @unit null
            ubind @mono
            label3:
            jump :label5 notEqual @unit null
            end
            jump :label4 notEqual @unit null
            label5:
            jump :label3 notEqual @unit null
        ''')

        uut = Preprocessor(input)

        with self.assertRaisesRegex(ValueError, r'Line 9'):
            uut.process_labels()
Example #15
def preprocess_pipeline():
    """Read in raw lexicon and create preprocessed version."""
    for language, n in config.LANGUAGES_N:
        print("Preprocessing lexica for: {language}".format(language=language))
        if not os.path.exists("data/processed/{lan}".format(lan=language)):
            print("Creating directory: data/processed/{lan}".format(
                lan=language))
            os.mkdir("data/processed/{lan}".format(lan=language))
        if not os.path.exists(
                "data/processed/{lan}/reals".format(lan=language)):
            print("Creating directory: data/processed/{lan}/reals".format(
                lan=language))
            os.mkdir("data/processed/{lan}/reals".format(lan=language))
        config_dict = get_config_dict(config, language)
        ## Reset n according to language
        config_dict['n'] = n
        preprocessor = Preprocessor(**config_dict)
        info_for_generation = preprocessor.preprocess_lexicon()
        print("Now getting minimal pairs")
        preprocessor.get_minimal_pairs()
    def test_process_labels_target_two_labels(self):
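        # label1 precedes the first instruction, so the second jump resolves
        # backwards to line 1.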
        input = cleandoc('''
            label1:
            jump :label2 notEqual @unit null
            ubind @mono
            label2:
            jump :label1 notEqual @unit null
            end
        ''')
        expected = cleandoc('''
            jump 3 notEqual @unit null
            ubind @mono
            jump 1 notEqual @unit null
            end
        ''')

        uut = Preprocessor(input)
        uut.process_labels()

        actual = uut.result

        self.assertEqual(expected, actual)
    def test_process_labels_empty_lines(self):
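        # Blank lines are kept and count towards jump targets; only the label
        # line itself is removed.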
        input = cleandoc('''
            jump :label1 notEqual @unit null
            ubind @mono

            label1:

            end
        ''')
        expected = cleandoc('''
            jump 4 notEqual @unit null
            ubind @mono


            end
        ''')

        uut = Preprocessor(input)
        uut.process_labels()

        actual = uut.result

        self.assertEqual(expected, actual)
    def test__call__no_label_runs_without_exceptions(self):
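        # Called without labels, the preprocessor should return y as None.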
        x = [
            "Comparison with", "alkaline phosphatases", "and", "5-nucleotidase"
        ]
        labels = ["s", "sc", "o"]

        tokeniser = MagicMock()
        tokeniser.tokenize.side_effect = lambda x: x.split(" ")

        label_mapper = MagicMock()
        label_mapper.entity_labels = ["s"]
        label_mapper.continuation_symbol = {"s": "sc"}
        label_mapper.other_label = "o"
        label_mapper.label_to_index = lambda x: labels.index(x)

        sut = Preprocessor(max_feature_len=5,
                           tokeniser=tokeniser,
                           label_mapper=label_mapper)

        # Act
        x, y = sut(x)

        # Assert
        self.assertIsNone(y)
Example #19
def preprocess_lexicon(language):
    """Preprocess lexicon."""
    config_dict = get_config_dict(config, language)
    preprocessor = Preprocessor(**config_dict)
    info_for_generation = preprocessor.preprocess_lexicon()
    return info_for_generation
Example #20
import argparse
import sys

from src.preprocessor import Preprocessor


def parse_args():
    '''Parse and return command line arguments.'''
    parser = argparse.ArgumentParser(description='Mindustry Logic Preprocessor')
    parser.add_argument('--infile', '-f', nargs='?', type=argparse.FileType('r'),
                        help='Path to input file', default=sys.stdin)
    parser.add_argument('--outfile', '-o', nargs='?', type=argparse.FileType('w'),
                        help='Path to output file', default=sys.stdout)

    return parser.parse_args()


if __name__ == '__main__':
    cli = parse_args()

    with cli.infile as input:
        processor = Preprocessor(input.read())

    try:
        processor.process_labels()
    except Exception as exc:
        sys.stderr.write(str(exc))

    with cli.outfile as output:
        output.write(processor.get_result())
Example #21
from src.preprocessor import Preprocessor

if __name__ == '__main__':
    preprocessor = Preprocessor()
    preprocessor.run(metadata_filename='./data/metadata.feather')
    preprocessor.save(tfidf_weight_filename='./data/tfidf.json', idf_filename='./data/idf.json')
Example #22
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from src.preprocessor import Preprocessor
from random import sample
import numpy as np

import src.conf as conf
import pickle

test = open(conf.project_path + '/data/SemEval2016-task4-test.subtask-BD.txt',
            'r').readlines()
gold = open(
    conf.project_path + '/data/SemEval2016_task4_subtaskB_test_gold.txt',
    'r').readlines()

p = Preprocessor()

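# Toggle to rebuild the preprocessed test set; otherwise the cached pickle
# is loaded.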
preprocess = False

if preprocess:
    dataset = [p.preprocess(t.split('\t')[3].replace('\n', '')) for t in test]
    pickle.dump(
        dataset, open(conf.project_path + '/data/test_preprocessed.pickle',
                      'wb'))
else:
    dataset = pickle.load(
        open(conf.project_path + '/data/test_preprocessed.pickle', 'rb'))

labels = [t.split('\t')[2].replace('\n', '') for t in gold]

test_dataset = [(example, label) for example, label in zip(dataset, labels)]
Example #23
def main():

    if len(sys.argv) < 2:
        Helper.print("Too few arguments given. Use -help to get help.")
        return

    if len(sys.argv) > 3:
        Helper.print("Too many arguments given. Use -help to get help.")
        return

    input_path_string = sys.argv[1]

    if input_path_string == "-help":
        Helper.print("Following arguments are required:")
        Helper.print("[0] absolute path to source folder")
        Helper.print("[1] absolute path to output folder")
        Helper.print("Example: \"C:\\temp\\data\\input\" \"C:\\temp\\data\\output\"")
        return

    output_path_string = sys.argv[2]

    Helper.blockPrint()
    # declare folder paths
    root_dir = os.path.abspath(os.sep)
    root = os.path.join(root_dir, "temp", "TabExImg")
    FileHelper.createPathIfNotExisting(root)
    input_path = input_path_string  # multiple scanned PDFs
    FileHelper.createPathIfNotExisting(input_path)
    pdf_images_path = os.path.join(root, "01_pdf_images")  # multiple scanned PDFs
    FileHelper.createPathIfNotExisting(pdf_images_path)
    preprocessed_images_path = os.path.join(root, "02_preprocessed_images")  # folder per pdf | preprocessed images
    FileHelper.createPathIfNotExisting(preprocessed_images_path)
    treated_pdfs_path = os.path.join(root, "03_treated_pdfs")
    FileHelper.createPathIfNotExisting(treated_pdfs_path)
    output_path = output_path_string
    FileHelper.createPathIfNotExisting(output_path)
    output_boundaries_path = os.path.join(output_path, "excel")
    FileHelper.createPathIfNotExisting(output_boundaries_path)
    output_pdf_path = os.path.join(output_path, "pdf")
    FileHelper.createPathIfNotExisting(output_pdf_path)

    # delete any leftover files from previous runs (01_pdf_images, 02_preprocessed_images, 03_treated_pdfs)
    Helper.print("Deleting files from previous runs as a precaution...")
    FileHelper.deleteAllFilesInFolder(pdf_images_path)
    FileHelper.deleteAllFilesInFolder(preprocessed_images_path)
    FileHelper.deleteAllFilesInFolder(treated_pdfs_path)

    # convert PDFs to images
    pdfConverter = ConvertPdf(input_path, pdf_images_path)
    pdfConverter.convertPdfs()

    # preprocess image files
    preprocessor = Preprocessor(input_path, pdf_images_path, preprocessed_images_path, treated_pdfs_path)
    preprocessor.execute()

    # move original PDFs to backup folder
    FileHelper.moveFiles(input_path, treated_pdfs_path)

    # detect table boundaries
    detection = Detection(preprocessed_images_path, output_path, output_boundaries_path, output_pdf_path)
    detection.detectTableBoundaries()

    # combine files
    Helper.print("Start Opitcal Character Recognition...")
    Helper.print("This can take some time...")
    ocrConverter = OcrConverter()
    ocrConverter.convertAllImagesToPdfs(pdf_images_path, output_pdf_path)
    ocrConverter.combinePdfs(output_pdf_path)
    Helper.print("Opitcal Character Recognition done...")

    Helper.print("Start cleanup temporary files...")
    # delete old files (01_pdf_images, 02_preprocessed_images, 03_treated_pdfs)
    FileHelper.deleteAllFilesInFolder(pdf_images_path)
    FileHelper.deleteAllFilesInFolder(preprocessed_images_path)
    FileHelper.deleteAllFilesInFolder(treated_pdfs_path)
    Helper.print("Cleanup done...")
    Helper.print("Table Detection Done")
    Helper.print("Result Files in " + output_path)
    def test_instance(self):
        input = ''

        _uut = Preprocessor(input)