Example 1
    def run(self, filename, preprocess=True, verbose=False):
        if preprocess:
            preprocessed_filename = Preprocessor().preprocess(
                filename, verbose)
            assert preprocessed_filename is not None, 'Preprocessing has failed!'
        else:
            preprocessed_filename = filename

        openie = 'none'
        srl = True
        triples_filename = FactsExtractor().extract_triples(
            preprocessed_filename, openie, srl, verbose)
        assert triples_filename is not None, 'Facts Extraction has failed!'

        kg_base = 'ncbo'
        links_filename = Linker().link(preprocessed_filename, kg_base, verbose)
        assert links_filename is not None, 'Ontology linking has failed!'

        rdf_filename = RDFMaker().make(triples_filename, links_filename, verbose)
        assert rdf_filename is not None, 'RDF generation has failed!'

        png_filename = GraphGenerator().generate(rdf_filename, verbose)
        assert png_filename is not None, 'Graph image generation has failed!'
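
A hedged usage sketch for the pipeline above; the snippet shows only the method, so the class name and input filename below are hypothetical stand-ins:

if __name__ == '__main__':
    # Pipeline is a stand-in name for whatever class defines run() above;
    # 'paper.txt' is a placeholder input document.
    Pipeline().run('paper.txt', preprocess=True, verbose=True)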
Example 2
    def document_to_splitted_sentences(document__file_list,
                                       document__output_path):
        """Split each input document and write the result, one segment per line.

        :param document__file_list: paths of the input .txt documents
        :param document__output_path: directory where the split files are written
        :return: list of paths to the written (or previously existing) output files
        """
        files_path = []
        for file in document__file_list:
            file_name = file.split("/")[-1].replace(".txt", "")
            out_path = os.path.join(document__output_path, file_name + ".txt")

            if os.path.exists(out_path):
                print("The document %s was skipped because it was already processed." %
                      file_name)
                files_path.append(out_path)
                continue

            if not os.path.exists(file):
                raise FileNotFoundError("Document %s does not exist." % file)

            print('Split processing %s ...' % file)
            document = Preprocessor().raw_document_splitter(file)

            with open(out_path, "w") as f:
                f.write("\n".join(document))

            files_path.append(out_path)

        return files_path
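
A minimal call sketch; the snippet omits the enclosing scope (there is no self parameter, so it may be a static helper), and both paths below are placeholders:

split_files = document_to_splitted_sentences(
    ['corpus/doc1.txt', 'corpus/doc2.txt'],  # placeholder input documents
    'out/sentences/')                        # placeholder output directory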
Example 3
class DocumentAnalyzer:
    def __init__(self, path: str, debug=False):
        self.__src = iu.read_img(path)
        self.__preprocessor = Preprocessor(self.__src, debug)
        self.__region_refiner = RegionRefiner(debug)
        self.__img_resized = None
        self.__img_text = None
        self.__ccs_text = None
        self.__ccs_non_text = None
        self.__ccs_dict = None
        self.__img_contoured = None
        self.__img_labeled = None
        self.__img_contoured_original_size = None
        self.__img_labeled_original_size = None
        self.__debug = debug
        self.__img_name = path.split('/')[-1]

    def analyze_document(self):
        if self.__debug:
            iu.show_and_wait('Image', self.__src)

        preprocessed = self.__preprocess()
        self.__apply_heuristic_filter(preprocessed)
        self.__apply_mll_classifier()
        self.__segment_text()
        ccs_non_text, rect_ccs_non_text = self.__refine_non_text_elements()
        self.__classify_non_text_element(ccs_non_text, rect_ccs_non_text)
        self.__label_regions()
        self.__img_contoured_original_size = self.__rescale_img_to_original(
            self.__img_contoured)
        self.__img_labeled_original_size = self.__rescale_img_to_original(
            self.__img_labeled)
        self.__store_output_img()

    def __preprocess(self):
        self.__img_resized = self.__preprocessor.get_resized_img()
        preprocessed = self.__preprocessor.preprocess()

        return preprocessed

    def __apply_heuristic_filter(self, preprocessed):
        self.__ccs_text, self.__ccs_non_text, self.__img_text = HeuristicFilter(
            preprocessed).filter()

        if self.__debug:
            ccs_and_colors = [(self.__ccs_text, (0, 255, 0)),
                              (self.__ccs_non_text, (0, 0, 255))]
            iu.draw_contours_then_show_and_wait('Heuristic Filter',
                                                self.__img_resized,
                                                ccs_and_colors)

    def __apply_mll_classifier(self):
        self.__ccs_text, mll_ccs_non_text, img_text = MllClassifier(
            self.__img_text, self.__debug).classify_non_text_ccs()
        self.__ccs_non_text.extend(mll_ccs_non_text)

        if self.__debug:
            ccs_and_colors = [(self.__ccs_text, (0, 255, 0)),
                              (self.__ccs_non_text, (0, 0, 255))]
            iu.draw_contours_then_show_and_wait('MLL Classifier',
                                                self.__img_resized,
                                                ccs_and_colors)

    def __segment_text(self):
        self.__ccs_text, self.__ccs_non_text = TextSegmenter(
            self.__img_text, self.__ccs_text, self.__ccs_non_text,
            self.__img_resized, self.__debug).segment_text()

        if self.__debug:
            ccs_and_colors = [(self.__ccs_text, (0, 255, 0)),
                              (self.__ccs_non_text, (0, 0, 255))]
            iu.draw_contours_then_show_and_wait('Segmented',
                                                self.__img_resized,
                                                ccs_and_colors)

    def __refine_non_text_elements(self):
        return self.__region_refiner.refine_non_text_regions(
            self.__img_resized.shape[:2], self.__ccs_non_text)

    def __classify_non_text_element(self, ccs_non_text, rect_ccs_non_text):
        self.__ccs_dict = NonTextClassifier(
            self.__img_resized.shape[:2], self.__ccs_text, ccs_non_text,
            rect_ccs_non_text).classify_non_text_elements()

    def __label_regions(self):
        self.__img_contoured, self.__img_labeled = self.__region_refiner.label_regions(
            self.__img_resized, self.__ccs_dict)

        if self.__debug:
            iu.show_and_wait('Contoured', self.__img_contoured)
            iu.show_and_wait('Labeled', self.__img_labeled)

    def __rescale_img_to_original(self, img):
        return self.__preprocessor.resize_img_to_original_size(img)

    def __store_output_img(self):
        os.makedirs('./out/img', exist_ok=True)

        # iu.write_img(f'./out/img/{self.__img_name}', self.__img_contoured_original_size)
        iu.write_img(f'./out/img/labelled-{self.__img_name}',
                     self.__img_labeled_original_size)
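
A minimal usage sketch grounded in the class above; the input path is a placeholder:

analyzer = DocumentAnalyzer('./pages/page-001.png', debug=False)
analyzer.analyze_document()  # writes the labeled image to ./out/img/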
Example 4
    logging.info('##########     EXTRACTING PROJECT     ##########')
    dataset_path = args.PROJECT
    if not os.path.exists(dataset_path):
        sys.exit('Specified dataset not found in dataset folder. Aborting')
    a.clean_dataset(dataset_path)
    a.extr_folder_classes(dataset_path)
    a.save(cache_path)
logging.info('Finished extracting {0:.4f}s'.format(time() - t0))

# Preprocess the extracted dataset, using the cached version unless a reload was requested.
t0 = time()
cache_path = '../dataset/cache/' + dataset_name + '_prep.pckl'
if os.path.isfile(cache_path) and not (args.reload_preprocessing
                                       or args.reload_extraction):
    logging.info('########## LOADING PREPROCESSED DATA FROM CACHE ##########')
    b = Preprocessor()
    b.load(cache_path)
else:
    logging.info('########## PREPROCESSING PROJECT ##########')
    b = Preprocessor(a.classes, type='class', pkg_start=args.pkg_start)
    b.save(cache_path)
logging.info('Finished preprocessing {0:.4f}s'.format(time() - t0))

# Choose appropriate vectorizer and initialize
if args.vectorizer == 'tfidf':
    v = TfidfVect(b.corpus)
elif args.vectorizer == 'count':
    v = CountVect(b.corpus)
else:
    logging.error('Vectorizer name not recognized. Exiting...')
    sys.exit()
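
The save/load calls above suggest a plain pickle round-trip; a minimal sketch of that caching pattern under that assumption (the real Preprocessor may serialize differently):

import pickle

class CacheMixin:
    """Hypothetical sketch of the persistence interface the script relies on."""

    def save(self, cache_path):
        # Persist the object's state so a later run can skip recomputation.
        with open(cache_path, 'wb') as f:
            pickle.dump(self.__dict__, f)

    def load(self, cache_path):
        # Restore previously saved state in place.
        with open(cache_path, 'rb') as f:
            self.__dict__.update(pickle.load(f))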
Example 5
import argparse

from preprocessor.preprocessor import Preprocessor

parser = argparse.ArgumentParser(
    description='Extract all features from text and save them.')

parser.add_argument('--use_tfidf',
                    action='store_true',
                    help='if set, tf-idf embeddings will be calculated')

parser.add_argument('--use_bert',
                    action='store_true',
                    help='if set, bert embeddings will be calculated')

parser.add_argument('--use_doc2vec',
                    action='store_true',
                    help='if set, doc2vec embeddings will be calculated')

parser.add_argument('--representation_size',
                    type=int,
                    default=256,
                    help='the dimension of the embedding vectors')

if __name__ == "__main__":
    args = parser.parse_args()

    p = Preprocessor(representation_size=args.representation_size,
                     use_tfidf=args.use_tfidf,
                     use_bert=args.use_bert,
                     use_doc2vec=args.use_doc2vec)
    p.preprocess()
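
Invoked from the shell as, for example, "python extract_features.py --use_tfidf --representation_size 128"; the script name is hypothetical, and only the flags defined above exist.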
Example 6
import argparse

import yaml

from preprocessor.preprocessor import Preprocessor

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("config", type=str, help="path to preprocess.yaml")
    args = parser.parse_args()

    with open(args.config, "r") as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
    preprocessor = Preprocessor(config)
    preprocessor.build_from_path()
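
Run as, for example, "python preprocess.py config/preprocess.yaml" (both names hypothetical); the keys the YAML must provide are whatever this project's Preprocessor expects, which the snippet does not show.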
Example 7
import argparse
import os

from preprocessor.preprocessor import Preprocessor

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Preprocessor arguments.')
    path = os.getcwd()
    parser.add_argument('-p', '--path', dest='path', default='data',
                        help='the location of the directory with raw audio and JSON files', type=str)
    parser.add_argument('-g', '--gender', dest='gender', default='mand',
                        help='the gender of the speaker')
    parser.add_argument('-a', '--age', dest='age', default=25,
                        help='the age of the speaker', type=int)
    parser.add_argument('-t', '--validation-size', dest='test_size', default=0.33,
                        help='the fraction of the data held out for validation', type=float)

    args = parser.parse_args()

    preprocessor = Preprocessor(os.path.join(path, args.path))
    preprocessor.convert_files()
    preprocessor.parse_json_to_csv(args.gender, args.age)
    preprocessor.split_set(args.test_size)
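
A sample invocation matching the defaults above, with a hypothetical script name: "python preprocess.py -p data -g mand -a 25 -t 0.33".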
Example 8
import argparse

from imutils import paths  # assumed source of the paths.list_images helper used below
from preprocessor.preprocessor import Preprocessor
from dataloader.dataloader import Dataloader
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

parser = argparse.ArgumentParser()

parser.add_argument("-d", "--dataset", required=True, help="path to input dataset")
parser.add_argument("-k", "--neighbors", type=int, default=1, help="number of nearest neighbours for classification")
parser.add_argument("-j", "--jobs", type=int, default=-1, help="number of jobs for k-NN distance(-1 uses all available cores)")

args = vars(parser.parse_args())

image_paths = list(paths.list_images(args["dataset"]))
preprocessor = Preprocessor(64, 64)
loader = Dataloader(preprocessors=[preprocessor])
(data, labels) = loader.load(image_paths, verbose=100)
# Flatten each 64x64 RGB image into a 12288-dimensional vector (64 * 64 * 3).
data = data.reshape((data.shape[0], 12288))

le = LabelEncoder()
labels = le.fit_transform(labels)

(trainX, testX, trainY, testY) = train_test_split(data, labels, test_size=0.25, random_state=42)

model = KNeighborsClassifier(n_neighbors=args["neighbors"], n_jobs=args["jobs"])
model.fit(trainX, trainY)
print(classification_report(testY, model.predict(testX), target_names=le.classes_))
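
A sample run with a hypothetical script name and dataset layout: "python knn_classifier.py --dataset datasets/animals --neighbors 3". paths.list_images walks the dataset directory recursively, so any nested folder of images works.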
Example 9
    def __init__(self, repository: Repository, preprocessor: Preprocessor):
        self.repository = repository
        self.preprocessor = preprocessor
        self.data_source = preprocessor.run(repository.get_data())
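
A hedged wiring sketch; the fragment does not show the enclosing class or a concrete Repository, so both names below are hypothetical:

# DataPipeline and InMemoryRepository are stand-in names for illustration.
pipeline = DataPipeline(InMemoryRepository(), Preprocessor())
print(pipeline.data_source)  # preprocessed data produced in __init__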