Example #1
    def table_parser(self, file_name, root):

        db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
        db_build_view = db.get_build_view()

        cursor = db_build_view._cursor

        # setup logging
        LOGGING_FORMAT = '%(levelname)s:\t%(asctime)-15s %(message)s'
        LOGGING_PATH = 'tmp/tableclasses-dbinsert.log'
        logging.basicConfig(filename=LOGGING_PATH, level=logging.DEBUG, format=LOGGING_FORMAT, filemode='w')

        html_parser = WikipediaHTMLTableParser()
        zip_file_path = os.path.join(root, file_name)
        html = self.zip2html(zip_file_path)
        html_parser.feed(html.decode('utf-8'))
        source_article_id = file_name.split('_')[1]
        try:
            fed_parser = WikipediaFedTextParser(html_parser.get_data())
            table_classes = fed_parser.table_classes(None)
            table_classes = list(set(table_classes))
            for table_class in table_classes:
                self.insert_table_class(source_article_id, table_class, cursor)
        except KeyError:
            db_build_view._db_connection.rollback()
            logging.error('KeyError FedTextParser source article id: %s ' % source_article_id)
        db_build_view.commit()
        db_build_view.reset_cache()
Example #2
    def table_parser(self, file_name, root):

        db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD,
                           DATABASE_NAME)
        db_build_view = db.get_build_view()

        cursor = db_build_view._cursor

        # setup logging
        LOGGING_FORMAT = '%(levelname)s:\t%(asctime)-15s %(message)s'
        LOGGING_PATH = 'tmp/tableclasses-dbinsert.log'
        logging.basicConfig(filename=LOGGING_PATH,
                            level=logging.DEBUG,
                            format=LOGGING_FORMAT,
                            filemode='w')

        html_parser = WikipediaHTMLTableParser()
        zip_file_path = os.path.join(root, file_name)
        html = self.zip2html(zip_file_path)
        html_parser.feed(html.decode('utf-8'))
        source_article_id = file_name.split('_')[1]
        try:
            fed_parser = WikipediaFedTextParser(html_parser.get_data())
            table_classes = fed_parser.table_classes(None)
            table_classes = list(set(table_classes))
            for table_class in table_classes:
                self.insert_table_class(source_article_id, table_class, cursor)
        except KeyError:
            db_build_view._db_connection.rollback()
            logging.error('KeyError FedTextParser source article id: %s ' %
                          source_article_id)
        db_build_view.commit()
        db_build_view.reset_cache()
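Examples #1 and #2 only show table_parser itself; the loop that feeds it file names is not part of them. Below is a possible driver, sketched purely as an assumption: the directory layout, the '.zip' suffix, and the helper name parse_all_tables are guesses, and the file names are assumed to carry the source article id in their second underscore-separated field, which is what file_name.split('_')[1] above relies on.

    import os

    def parse_all_tables(parser, articles_dir):
        # Sketch only: walk a directory tree of zipped Wikipedia HTML articles
        # and hand each file to the table_parser method shown above.
        for root, dirs, files in os.walk(articles_dir):
            for file_name in files:
                if file_name.endswith('.zip'):
                    parser.table_parser(file_name, root)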
Example #3
    def __init__(self, path):
        #os.environ["DISPLAY"]=":1"
        print path
        os.environ["DISPLAY"]=":1"
        db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
        self.db_build_view = db.get_build_view()
        self.cursor = self.db_build_view._cursor

        self.app = QApplication(sys.argv)
        self.path = path
Example #4
    def __init__(self, path):
        #os.environ["DISPLAY"]=":1"
        print path
        os.environ["DISPLAY"] = ":1"
        db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD,
                           DATABASE_NAME)
        self.db_build_view = db.get_build_view()
        self.cursor = self.db_build_view._cursor

        self.app = QApplication(sys.argv)
        self.path = path
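The constructor in Examples #3 and #4 points Qt at display :1 before creating the QApplication, which suggests rendering on a virtual X server such as Xvfb. A minimal standalone sketch of the same setup follows; the PyQt4 import and the running Xvfb instance are assumptions, since the examples do not show their imports or environment.

    import os
    import sys

    os.environ["DISPLAY"] = ":1"        # must be set before Qt connects to X

    from PyQt4.QtGui import QApplication

    app = QApplication(sys.argv)        # one QApplication per process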
Example #5
    def _extract_articles(self):

        INPUT_FILE = WIKI_DUMP_XML_FILE  #self.read_path('Please enter the path of the wiki dump file [.xml]')
        #INPUT_FILE = "/home/ddimitrov/wikiwsd/data/training.xml"#self.read_path('Please enter the path of the wiki dump file [.xml]')
        MAX_ARTICLES_IN_QUEUE = 200  #self.read_number('How many articles should be kept in the memory at any time at most?', 200, 20, 1000)
        NUM_THREADS = 1  #self.read_number('How many threads shall be used to write to the database?', 20, 1, 50)
        CONTINUE = True  #self.read_yes_no('This process might take several days to finish.\nDo you want to continue?')

        if CONTINUE:
            # measure time
            start = time.clock()

            # connect to database and create article queue
            db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD,
                               DATABASE_NAME)
            queue = Queue.Queue(maxsize=MAX_ARTICLES_IN_QUEUE)

            # create reader and threads
            reader = WikipediaReader(INPUT_FILE, queue, extract_text=False)
            threads = []
            for i in range(0, NUM_THREADS):
                inserter = ArticleInserter(queue, db.get_build_view())
                threads.append(inserter)

            # start reader
            reader.start()

            # start insert threads
            for thread in threads:
                thread.start()

            # wait for reading thread, queue and inserters to be done
            reader.join()
            queue.join()
            for thread in threads:
                thread.end()
            for thread in threads:
                thread.join()

            seconds = round(time.clock() - start)
            print 'Finished after %02d:%02d minutes' % (seconds / 60,
                                                        seconds % 60)

        else:
            print 'Aborting...'
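One detail worth noting about the timing above: on Unix, time.clock() returns processor time rather than wall-clock time, so the reported duration can differ noticeably from the real elapsed time when the run is dominated by database and disk I/O. A wall-clock variant of the same timing code, as a sketch:

    import time

    start = time.time()
    # ... start reader and inserter threads, wait for them to finish ...
    seconds = round(time.time() - start)
    print 'Finished after %02d:%02d minutes' % (seconds / 60, seconds % 60)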
Example #6
    def _extract_articles(self):

        INPUT_FILE = WIKI_DUMP_XML_FILE  #self.read_path('Please enter the path of the wiki dump file [.xml]')
        #INPUT_FILE = "/home/ddimitrov/wikiwsd/data/training.xml"#self.read_path('Please enter the path of the wiki dump file [.xml]')
        MAX_ARTICLES_IN_QUEUE = 200  #self.read_number('How many articles should be kept in the memory at any time at most?', 200, 20, 1000)
        NUM_THREADS = 1  #self.read_number('How many threads shall be used to write to the database?', 20, 1, 50)
        CONTINUE = True  #self.read_yes_no('This process might take several days to finish.\nDo you want to continue?')

        if CONTINUE:
            # measure time
            start = time.clock()

            # connect to database and create article queue
            db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
            queue = Queue.Queue(maxsize=MAX_ARTICLES_IN_QUEUE)

            # create reader and threads
            reader = WikipediaReader(INPUT_FILE, queue, extract_text=False)
            threads = []
            for i in range(0, NUM_THREADS):
                inserter = ArticleInserter(queue, db.get_build_view())
                threads.append(inserter)

            # start reader
            reader.start()

            # start insert threads
            for thread in threads:
                thread.start()

            # wait for reading thread, queue and inserters to be done
            reader.join()
            queue.join()
            for thread in threads:
                thread.end()
            for thread in threads:
                thread.join()

            seconds = round(time.clock() - start)
            print 'Finished after %02d:%02d minutes' % (seconds / 60, seconds % 60)

        else:
            print 'Aborting...'
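ArticleInserter itself is not shown in these examples. From the way it is used above it has to pull articles off the queue, mark each one with task_done() so that queue.join() can return, and stop its loop once end() is called. The following is a minimal sketch of such a worker, written as an assumption about that behaviour rather than the project's actual implementation:

    import threading
    import Queue

    class QueueWorkerSketch(threading.Thread):
        # Hypothetical stand-in for ArticleInserter: drains an article queue
        # and writes each item through a database build view.

        def __init__(self, queue, build_view):
            threading.Thread.__init__(self)
            self._queue = queue
            self._build_view = build_view
            self._stopped = False

        def run(self):
            while not self._stopped:
                try:
                    article = self._queue.get(timeout=1)
                except Queue.Empty:
                    continue
                try:
                    pass  # insert the article via self._build_view here
                finally:
                    self._queue.task_done()   # lets queue.join() return

        def end(self):
            self._stopped = True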