def test_to_json(self):
    """Verify that the template configuration serializes to the expected dict.

    Loads ``template_conf.yml`` from ``self.test_data_path`` and compares
    ``Configuration.to_json()`` with a hard-coded structure. Both sides are
    passed through ``self._sort_dict`` before the comparison.
    """
    logger.info('Loading Configuration..')
    template_path = os.path.join(self.test_data_path, 'template_conf.yml')
    configuration = Configuration(config_src=template_path)

    aws_section = {
        'config': {
            'access_key': 'access_key_1',
            'secret_key': 'secret_key_1',
            'instance_id': 'instance_id_1',
            'ec2_region': 'ec2_region_1',
            'ec2_amis': ['ec2_ami_1'],
            'ec2_keypair': 'ec2_keypair_1',
            'ec2_secgroups': ['ec2_secgroup_1'],
            'ec2_instancetype': 'ec2_instancetype_1',
        }
    }
    mineserver_section = {
        'config': {
            'ssh_key_file_path': 'ssh_key_file_path_1',
            'memory_allocation': 'memory_allocation_1',
        }
    }
    web_client_section = {
        'config': {
            'server_password': '******',
        }
    }
    expected_json = {
        'aws': [aws_section],
        'mineserver': [mineserver_section],
        'web_client': [web_client_section],
    }

    # Compare
    logger.info('Comparing the results..')
    expected_sorted = self._sort_dict(expected_json)
    actual_sorted = self._sort_dict(configuration.to_json())
    self.assertDictEqual(expected_sorted, actual_sorted)
def test_to_json(self):
    """Verify that the template configuration serializes to the expected dict.

    Loads ``template_conf.yml`` from ``self.test_data_path`` and compares
    ``Configuration.to_json()`` with a hard-coded structure. Both sides are
    passed through ``self._sort_dict`` before the comparison.
    """
    logger.info('Loading Configuration..')
    template_path = os.path.join(self.test_data_path, 'template_conf.yml')
    configuration = Configuration(config_src=template_path)

    datastore_section = {
        'config': {
            'hostname': 'host123',
            'username': '******',
            'password': '******',
            'db_name': 'db3',
            'port': 3306,
        },
        'type': 'mysql',
    }
    cloudstore_section = {
        'config': {
            'api_key': 'apiqwerty',
        },
        'type': 'dropbox',
    }
    expected_json = {
        'tag': 'production',
        'datastore': [datastore_section],
        'cloudstore': [cloudstore_section],
    }

    # Compare
    logger.info('Comparing the results..')
    expected_sorted = self._sort_dict(expected_json)
    actual_sorted = self._sort_dict(configuration.to_json())
    self.assertDictEqual(expected_sorted, actual_sorted)
def __init__(self, language):
    """Initialize a WikiCorpus instance for a single language.

    :language: unicode
    """
    # TODO: check if language is in dictionary of iso codes
    self._language = language

    # the corpus configuration is read from the class-level config path
    config_path = WikiCorpus.CORPUS_CONFIG_PATH
    self._configuration = Configuration(config_path)
def setup_classes(config_file: str, log: str, debug: bool):
    """Set up logging and load the configuration sections used at start-up.

    Args:
        config_file: Passed to ``Configuration`` as ``config_src``.
        log: Passed to ``_setup_log`` as ``log_path``.
        debug: Passed to ``_setup_log`` as ``debug``.

    Returns:
        A tuple ``(logger, config, aws_config, mineserver_config,
        web_client_config)``. Each ``*_config`` is the first entry of the
        matching section; ``web_client_config['permitted_days']`` has been
        converted from a comma-separated string into a list of stripped
        strings.
    """
    # Initialize
    _setup_log(log_path=log, debug=debug)

    # Load the configuration
    logger = logging.getLogger('Init')
    logger.debug("Loading the configs..")
    config = Configuration(config_src=config_file)
    aws_config = config.get_aws_configs()[0]
    mineserver_config = config.get_mineserver_configs()[0]
    web_client_config = config.get_web_client_configs()[0]

    # "mon, tue" -> ["mon", "tue"]
    permitted_days = []
    for raw_day in web_client_config['permitted_days'].split(','):
        permitted_days.append(raw_day.strip())
    web_client_config['permitted_days'] = permitted_days

    return logger, config, aws_config, mineserver_config, web_client_config
def test_to_yaml(self):
    """Round-trip a modified configuration through ``to_yaml``.

    Loads the template, changes the datastore hostname, exports the
    configuration to a yml file, reloads that file and compares its
    ``to_json()`` output with the expected structure.
    """
    logger.info('Loading Configuration..')
    template_path = os.path.join(self.test_data_path, 'template_conf.yml')
    configuration = Configuration(config_src=template_path)

    # Modify and export yml
    logger.info('Changed the host and the api_key..')
    datastore_config = configuration.datastore[0]['config']
    datastore_config['hostname'] = 'changedhost'
    logger.info('Exporting to yaml..')
    configuration.to_yaml(
        'test_data/test_configuration/actual_output_to_yaml.yml')

    # Load the modified yml
    logger.info('Loading the exported yaml..')
    exported_path = os.path.join(self.test_data_path,
                                 'actual_output_to_yaml.yml')
    modified_configuration = Configuration(config_src=exported_path)

    # Compare
    logger.info('Comparing the results..')
    expected_datastore = {
        'config': {
            'hostname': 'changedhost',
            'username': '******',
            'password': '******',
            'db_name': 'db3',
            'port': 3306,
        },
        'type': 'mysql',
    }
    expected_json = {
        'tag': 'production',
        'datastore': [expected_datastore],
    }
    expected_sorted = self._sort_dict(expected_json)
    actual_sorted = self._sort_dict(modified_configuration.to_json())
    self.assertDictEqual(expected_sorted, actual_sorted)
def __init__(self, url):
    """Store the target url and initialize empty result holders.

    :param url: value stored unchanged on ``self.url``
    """
    self.url = url

    # nothing has been fetched or extracted yet
    self.doc = self.rawtext = ""
    self.data = set()

    # collaborators
    self.config = Configuration()
    self.helper = Helper()
def __init__(self, file, keywords):
    """Store the inputs and initialize empty result containers.

    :param file: value stored unchanged on ``self.file``
    :param keywords: value stored unchanged on ``self.keywords``
    """
    self.config = Configuration()

    # inputs
    self.keywords = keywords
    self.file = file

    # results, filled in later
    self.urls = []
    self.data = []

    self.helper = Helper()
def __create_parser(configuration: Configuration):
    """Return the Wiktionary parser matching the configured language.

    :param configuration: configuration whose ``get_parser()`` value selects
        the parser; it is also handed to the parser's constructor
    :return: an ``EnglishWiktionaryParser`` or ``PolishWiktionaryParser``,
        or ``None`` when the language is neither English nor Polish
    """
    language = configuration.get_parser()

    if language == Languages.ENGLISH:
        return EnglishWiktionaryParser(configuration)
    if language == Languages.POLISH:
        return PolishWiktionaryParser(configuration)

    # no parser is available for any other language
    return None
def init_main() -> Tuple[argparse.Namespace, Configuration]:
    """Parse the CLI arguments, set up logging and load the configuration.

    Returns:
        The parsed arguments and the ``Configuration`` built from
        ``args.config_file``.
    """
    args = _argparser()
    _setup_log(args.log, args.debug)

    start_message = "Starting in run mode: {0}".format(args.run_mode)
    logger.info(start_message)

    # Load the configuration
    return args, Configuration(config_src=args.config_file)
def setUpClass(cls):
    """Prepare logging and the shared configuration for the test class.

    Raises:
        Exception: if the ``DROPBOX_API_KEY`` environment variable is unset.
    """
    cls._setup_log()

    api_key_missing = "DROPBOX_API_KEY" not in os.environ
    if api_key_missing:
        message = 'DROPBOX_API_KEY env variable is not set!'
        logger.error(message)
        raise Exception(message)

    logger.info('Loading Configuration..')
    template_path = os.path.join(cls.test_data_path, 'template_conf.yml')
    cls.configuration = Configuration(config_src=template_path)
def parse(self):
    """Build a ``Configuration`` from the values in ``self.args``.

    :return: a ``Configuration`` whose keyword arguments are filled from the
        ``source_repo``, ``dst_repo``, ``cwd``, ``sub_folder``, ``includes``,
        ``excludes``, ``regexp`` and ``branch`` entries of ``self.args``
    """
    args = self.args
    # maps Configuration keyword -> value of the matching argument
    settings = {
        'original_repo': args['source_repo'],
        'new_repo_namespace': args['dst_repo'],
        'working_directory': args['cwd'],
        'sub_folder': args['sub_folder'],
        'allowed_folders': args['includes'],
        'not_allowed_folders': args['excludes'],
        'regex_for_folder_name': args['regexp'],
        'branch': args['branch'],
    }
    return Configuration(**settings)
def setUpClass(cls):
    """Prepare logging and the shared configuration for the test class.

    Raises:
        Exception: if ``EMAIL_ADDRESS`` or ``GMAIL_API_KEY`` is missing from
            the environment.
    """
    cls._setup_log()

    gmail_os_vars = ['EMAIL_ADDRESS', 'GMAIL_API_KEY']
    some_var_missing = any(gmail_os_var not in os.environ
                           for gmail_os_var in gmail_os_vars)
    if some_var_missing:
        message = 'Gmail env variables are not set!'
        logger.error(message)
        raise Exception(message)

    logger.info('Loading Configuration..')
    template_path = os.path.join(cls.test_data_path, 'template_conf.yml')
    cls.configuration = Configuration(config_src=template_path)
def setUpClass(cls):
    """Prepare logging, configuration and a clean remote test folder.

    After loading ``template_conf_all_args.yml`` the remote folder
    ``/job_bot_tests`` is deleted through the first configured cloudstore.

    Raises:
        Exception: if the ``DROPBOX_API_KEY`` environment variable is unset.
    """
    cls._setup_log()

    api_key_missing = "DROPBOX_API_KEY" not in os.environ
    if api_key_missing:
        message = 'DROPBOX_API_KEY env variable is not set!'
        logger.error(message)
        raise Exception(message)

    logger.info('Loading Configuration..')
    conf_path = os.path.join(cls.test_data_path,
                             'template_conf_all_args.yml')
    cls.configuration = Configuration(config_src=conf_path)
    cls.remote_tests_folder = '/job_bot_tests'

    # remove whatever is stored in the remote test folder
    first_cloudstore_config = cls.configuration.get_cloudstores()[0]
    cloud_store = JobBotDropboxCloudstore(config=first_cloudstore_config)
    cloud_store.delete_file(cls.remote_tests_folder)
def test_schema_validation(self):
    """Check schema validation for a valid and an invalid yml file.

    The valid file is loaded with an explicit schema path and must not raise
    ``ValidationError``; the invalid file is loaded without one and must
    raise ``ValidationError``.
    """
    correct_conf_path = os.path.join(self.test_data_path,
                                     'minimal_conf_correct.yml')
    schema_path = os.path.join('..', 'tests', self.test_data_path,
                               'minimal_yml_schema.json')
    wrong_conf_path = os.path.join(self.test_data_path,
                                   'minimal_conf_wrong.yml')

    try:
        logger.info('Loading the correct Configuration..')
        Configuration(config_src=correct_conf_path,
                      config_schema_path=schema_path)
    except ValidationError as e:
        logger.error('Error validating the correct yml: %s', e)
        self.fail('Error validating the correct yml')
    else:
        logger.info('First yml validated successfully.')

    with self.assertRaises(ValidationError):
        logger.info('Loading the wrong Configuration..')
        Configuration(config_src=wrong_conf_path)
    logger.info('Second yml failed to validate successfully.')
def test_init(self):
    """Check the flags a cloudstore derives from its configuration.

    With the class-level configuration the store must report attachments and
    have every ``_update_*`` flag set. With the required-args-only
    configuration it must report no attachments and have every flag unset.
    """
    req_only_conf = Configuration(
        config_src=os.path.join(self.test_data_path,
                                'template_conf_required_args_only.yml'))

    cloud_store = JobBotDropboxCloudstore(
        config=self.configuration.get_cloudstores()[0],
        remote_files_folder=self.remote_tests_folder)
    boolean_attributes = [
        len(cloud_store.attachments_names) > 0,
        cloud_store._update_stop_words,
        cloud_store._update_application_to_send_email,
        cloud_store._update_inform_success_email,
        cloud_store._update_inform_should_call_email,
    ]
    # The condition must be the FIRST argument: assertTrue(True, cond) passes
    # unconditionally because cond is only used as the failure message.
    self.assertTrue(all(boolean_attributes))

    req_only_cloud_store = JobBotDropboxCloudstore(
        config=req_only_conf.get_cloudstores()[0],
        remote_files_folder=self.remote_tests_folder)
    req_only_boolean_attributes = [
        len(req_only_cloud_store.attachments_names) == 0,
        not req_only_cloud_store._update_stop_words,
        not req_only_cloud_store._update_application_to_send_email,
        not req_only_cloud_store._update_inform_success_email,
        not req_only_cloud_store._update_inform_should_call_email,
    ]
    self.assertTrue(all(req_only_boolean_attributes))
def create_config(data_loaded):
    """Build a ``Configuration`` from the ``migrator`` section of loaded data.

    :param data_loaded: mapping with a ``'migrator'`` entry that provides the
        keys ``original_repo``, ``new_repo_namespace``, ``working_directory``,
        ``sub_folder``, ``includes``, ``excludes``, ``regex_folder_name`` and
        ``branch``
    :return: the resulting ``Configuration``
    """
    section = data_loaded['migrator']
    # maps Configuration keyword -> value of the matching migrator entry
    settings = {
        'original_repo': section['original_repo'],
        'new_repo_namespace': section['new_repo_namespace'],
        'working_directory': section['working_directory'],
        'sub_folder': section['sub_folder'],
        'allowed_folders': section['includes'],
        'not_allowed_folders': section['excludes'],
        'regex_for_folder_name': section['regex_folder_name'],
        'branch': section['branch'],
    }
    return Configuration(**settings)
def isConnected(self):
    """Probe the configured check URLs and report which one is reachable.

    The global check URL is tried first and the China check URL second,
    each with a 2 second timeout. Any response counts as reachable; the
    HTTP status code is not inspected.

    :return: ``"GLOBAL"``, ``"CHINA"`` or ``"NONETWORK"``
    """
    config = Configuration()

    # ``except Exception`` instead of a bare ``except`` so that
    # KeyboardInterrupt / SystemExit are no longer swallowed by the probe.
    try:
        requests.get(config.global_check_url, timeout=2)
        return "GLOBAL"
    except Exception:
        # deliberate fall-through: try the second endpoint
        pass

    try:
        requests.get(config.china_check_url, timeout=2)
        return "CHINA"
    except Exception:
        return "NONETWORK"
def setUpClass(cls):
    """Prepare logging and the shared configuration for the test class.

    Raises:
        Exception: if any of ``MYSQL_HOST``, ``MYSQL_USERNAME``,
            ``MYSQL_PASSWORD`` or ``MYSQL_DB_NAME`` is missing from the
            environment.
    """
    cls._setup_log()

    mysql_os_vars = [
        'MYSQL_HOST', 'MYSQL_USERNAME', 'MYSQL_PASSWORD', 'MYSQL_DB_NAME'
    ]
    some_var_missing = any(mysql_os_var not in os.environ
                           for mysql_os_var in mysql_os_vars)
    if some_var_missing:
        message = 'Mysql env variables are not set!'
        logger.error(message)
        raise Exception(message)

    logger.info('Loading Configuration..')
    template_path = os.path.join(cls.test_data_path, 'template_conf.yml')
    cls.configuration = Configuration(config_src=template_path)
def test_to_yaml(self):
    """Round-trip a modified configuration through ``to_yaml``.

    Loads the template, changes the aws access key and the mineserver ssh
    key path, exports the configuration to a yml file, reloads that file and
    compares its ``to_json()`` output with the expected structure.
    """
    logger.info('Loading Configuration..')
    template_path = os.path.join(self.test_data_path, 'template_conf.yml')
    configuration = Configuration(config_src=template_path)

    # Modify and export yml
    logger.info('Changed the host and the api_key..')
    aws_config = configuration.aws[0]['config']
    aws_config['access_key'] = 'access_key_2'
    mineserver_config = configuration.mineserver[0]['config']
    mineserver_config['ssh_key_file_path'] = 'ssh_key_file_path_2'
    logger.info('Exporting to yaml..')
    configuration.to_yaml(
        'test_data/test_configuration/actual_output_to_yaml.yml')

    # Load the modified yml
    logger.info('Loading the exported yaml..')
    exported_path = os.path.join(self.test_data_path,
                                 'actual_output_to_yaml.yml')
    modified_configuration = Configuration(config_src=exported_path)

    # Compare
    logger.info('Comparing the results..')
    expected_aws = {
        'config': {
            'access_key': 'access_key_2',
            'secret_key': 'secret_key_1',
            'instance_id': 'instance_id_1',
            'ec2_region': 'ec2_region_1',
            'ec2_amis': ['ec2_ami_1'],
            'ec2_keypair': 'ec2_keypair_1',
            'ec2_secgroups': ['ec2_secgroup_1'],
            'ec2_instancetype': 'ec2_instancetype_1',
        }
    }
    expected_mineserver = {
        'config': {
            'ssh_key_file_path': 'ssh_key_file_path_2',
            'memory_allocation': 'memory_allocation_1',
        }
    }
    expected_web_client = {
        'config': {
            'server_password': '******',
        }
    }
    expected_json = {
        'aws': [expected_aws],
        'mineserver': [expected_mineserver],
        'web_client': [expected_web_client],
    }
    expected_sorted = self._sort_dict(expected_json)
    actual_sorted = self._sort_dict(modified_configuration.to_json())
    self.assertDictEqual(expected_sorted, actual_sorted)
def setup() -> Tuple[Dict, Dict, Dict, Dict, str]:
    """Set up the configuration and the run properties.

    Parses the CLI arguments, installs a temporary stream logger, loads the
    first entry of the spark, input, run-options and output configuration
    sections, derives a run name from the run options and finally calls
    ``_setup_log`` with a log file named after that run.

    Returns:
        ``(spark_config, input_config, run_options_config, output_config,
        modified_graph_name)``.
    """
    args = _argparser()

    # Temporary logging
    # noinspection PyArgumentList
    log_level = logging.DEBUG if args.debug else logging.INFO
    logging.basicConfig(
        level=log_level,
        format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S',
        handlers=[logging.StreamHandler()])

    # Load the configuration
    config = Configuration(config_src=args.config_file)
    spark_config = config.get_spark_configs()[0]
    input_config = config.get_input_configs()[0]
    run_options_config = config.get_run_options_configs()[0]
    output_config = config.get_output_configs()[0]

    # Encode the run options into an identifier used for the graph/log name
    name_template = "featMinAvg-{featMinAvg}_rLvl1-{rLvl1}_" \
                    "rLvl2-{rLvl2}_betwThres-{betwThres}_feats-{feats}"
    feature_min_avg = run_options_config['feature_min_avg']
    r_lvl1_thres = run_options_config['r_lvl1_thres']
    r_lvl2_thres = run_options_config['r_lvl2_thres']
    betweenness_thres = run_options_config['betweenness_thres']
    # every feature except the first one, each cut to at most 10 characters
    shortened_features = []
    for feat in run_options_config['features_to_check'][1:]:
        shortened_features.append(feat[:10])
    options_id_name = name_template.format(
        featMinAvg=feature_min_avg,
        rLvl1=r_lvl1_thres,
        rLvl2=r_lvl2_thres,
        betwThres=betweenness_thres,
        feats=''.join(shortened_features))

    modified_graph_name = os.path.join(input_config['name'], options_id_name)
    log_path = os.path.join(output_config['logs_folder'],
                            modified_graph_name + '.log')
    _setup_log(log_path, debug=args.debug)

    return (spark_config, input_config, run_options_config, output_config,
            modified_graph_name)
def parse(json):
    """Build a ``Configuration`` from an already-decoded JSON mapping.

    :param json: mapping with the keys ``input``, ``predict_input``,
        ``output``, ``layers``, ``iterations``, ``learning_rate``,
        ``predict_output`` and ``predict_threshold``
    :return: ``Configuration`` created from the converted values

    The values of ``input`` and ``predict_input`` are stacked into arrays and
    transposed. The layer list starts with a ``tanh`` layer sized by the
    first dimension of the training input, followed by one ``Layer`` per
    entry of ``layers``.
    """
    training_input = np.array(list(json["input"].values())).T
    predict_input = np.array(list(json["predict_input"].values())).T

    expected_output = np.array(json["output"])

    layers = [Layer(training_input.shape[0], "tanh")]
    for layer_spec in json["layers"]:
        layers.append(Layer(layer_spec["neurons"], layer_spec["activation"]))

    iterations = int(json["iterations"])
    learning_rate = float(json["learning_rate"])
    predict_output = np.array(json["predict_output"])
    predict_threshold = float(json["predict_threshold"])

    return Configuration(
        training_input,
        expected_output,
        layers,
        iterations,
        learning_rate,
        predict_input,
        predict_output,
        predict_threshold,
    )
class WiktionaryParser:
    """Streams a Wiktionary XML dump through a language-specific parser."""

    def __init__(self, configuration_path):
        """Load the configuration and create the matching language parser.

        :param configuration_path: passed unchanged to ``Configuration``
        """
        self._configuration = Configuration(configuration_path)
        self._parser = self.__create_parser(self._configuration)

    @staticmethod
    def __create_parser(configuration: Configuration):
        """Return the parser for the configured language.

        :return: an English or Polish parser, or ``None`` for any other
            language
        """
        parser = None
        language = configuration.get_parser()
        if language == Languages.ENGLISH:
            parser = EnglishWiktionaryParser(configuration)
        elif language == Languages.POLISH:
            parser = PolishWiktionaryParser(configuration)
        return parser

    def parse(self):
        """Iterate over the dump and collect the parser output per page.

        :return: dict mapping a page title to the truthy result that
            ``parse_text`` returned for that page's text
        """
        logging.info('Start parser')
        start_time = time.time()

        events = {'start', 'end'}
        parser_result = {}
        last_page_title = None
        wiktionary_path = self._configuration.get_wiktionary_path()

        for event, elem in ET.iterparse(wiktionary_path, events=events):
            tag = self.__get_tag(elem)
            if self.__is_start_title(tag, event):
                # NOTE(review): the title text is read on the 'start' event;
                # the ElementTree docs do not guarantee ``text`` is populated
                # at that point -- confirm this is reliable for the dumps used.
                last_page_title = elem.text
            elif tag == _TEXT and elem.text:
                result = self._parser.parse_text(elem.text, last_page_title,
                                                 parser_result)
                if result:
                    parser_result[last_page_title] = result
                # drop the processed text to keep memory usage low
                elem.clear()

        end_time = time.time()
        logging.info('Parsing time: {} s'.format(end_time - start_time))
        return parser_result

    @staticmethod
    def __get_tag(elem):
        """Return the element tag without its ``{namespace}`` prefix.

        Tags that carry no namespace are returned unchanged instead of
        raising ``IndexError``.
        """
        return elem.tag.rsplit('}', 1)[-1]

    @staticmethod
    def __is_start_title(tag, event):
        """Return True for the start event of a title element."""
        return tag == _TITLE and event == _START

    @staticmethod
    def __is_end_text(tag, event):
        """Return True for the end event of a text element."""
        return tag == _TEXT and event == _END
def main():
    """Transfer the current playback to the configured target device.

    Reads the first two Spotify configurations (index 0 for reading, index 1
    for modifying), fetches the playback info with the "read" client and
    starts playback on ``target_device_id`` with the "modify" client.

    :Example:
    python main.py -c confs/raspotify_conf.yml -l logs/spoticlick.log
    """
    # Initializing
    args = _argparser()
    _setup_log(args.log, args.debug)

    # Load the configuration
    configuration = Configuration(config_src=args.config_file)

    # Init Spotipy
    spoti_read_config = configuration.get_spotifies()[0]
    spoti_modify_config = configuration.get_spotifies()[1]
    target_device_id = spoti_modify_config["target_device_id"]
    spot_read = Spotipy(config=spoti_read_config, token_id='read')
    spot_modify = Spotipy(config=spoti_modify_config, token_id='modify')

    transfer_message = ("Transferring music to device id: %s"
                        % target_device_id)
    logger.info(transfer_message)
    playback_info = spot_read.get_playback_info()
    spot_modify.play_on_device(target_device_id=target_device_id,
                               session_info=playback_info)
    logger.info("Music Transferred!")
def main():
    """Change the playback volume in the direction given on the CLI.

    Reads the first two Spotify configurations (index 0 for reading, index 1
    for modifying), fetches the current volume with the "read" client and
    asks the "modify" client to update it.

    :Example:
    python main.py -c confs/raspotify_conf.yml -l logs/spoticlick.log
    """
    # Initializing
    args = _argparser()
    _setup_log(args.log, args.debug)

    # Load the configuration
    configuration = Configuration(config_src=args.config_file)

    # Init Spotipy
    spoti_read_config = configuration.get_spotifies()[0]
    spoti_modify_config = configuration.get_spotifies()[1]
    spot_read = Spotipy(config=spoti_read_config, token_id='read')
    spot_modify = Spotipy(config=spoti_modify_config, token_id='modify')

    if args.volume_direction == 'increase':
        action = "Increasing"
    else:
        action = "Decreasing"
    logger.debug("%s volume by 5%%.." % action)

    current_volume = spot_read.get_current_volume()
    spot_modify.volume_update(direction=args.volume_direction,
                              current_volume=current_volume)
    logger.debug("Volume changed!")
def __init__(self, configuration: Configuration, handler: Handler = None):
    """Bind the server address and open the delivery-report collection.

    :param configuration: source of the ``server.*``, ``mongo.*`` and
        ``dr.path`` properties
    :param handler: forwarded to the parent constructor together with the
        ``(host, port)`` address
    """
    server_address = (configuration.get_property("server.host"),
                      configuration.get_property("server.port"))
    super().__init__(server_address, handler)

    self.file_searcher = FileSearcher(True, False)
    self.running = False

    # Mongo: client -> database -> collection
    mongo_host = configuration.get_property("mongo.host")
    mongo_port = int(configuration.get_property("mongo.port"))
    mongo_client = MongoClient(mongo_host, mongo_port)
    database = mongo_client.get_database(
        configuration.get_property("mongo.database"))
    self.dr_collection = database.get_collection(
        configuration.get_property("mongo.collection.drs"))
    self.dr_collection.create_index("messageId")

    self.dr_path = configuration.get_property("dr.path")
def main(debug=False):
    """Run every generator step with one shared configuration.

    Steps, in order: folder structure, package.json, extension.js, coloring
    and outline.

    :param debug: accepted but not used by this function
    """
    configuration = Configuration()

    FolderGenerator(configuration).make_folder_structure()
    PackageJsonGenerator(configuration).generate_package_json()
    ExtensionGenerator(configuration).generate_extension_js()
    ColoringVSCode(configuration).do_coloring_for_vscode()
    OutlineVSCode(configuration).do_outline_for_vscode()
def save(self):
    """Insert this question into the database unless it is already stored.

    A row is looked up in ``questions`` by ``sentence``. When none exists, a
    new row is inserted with the sentence, the keyword, the choices joined
    by commas, and ``0`` in the fourth column.
    """
    # local import: only this method needs it
    from contextlib import closing

    config = Configuration()
    choices = ",".join(self.choices)

    # ``with connection`` only commits or rolls back the transaction; it does
    # not close the connection, so ``closing`` is needed to release it.
    with closing(sqlite.connect(config.db_file)) as connection:
        with connection:
            cursor = connection.cursor()

            # search for current sentence in database
            cursor.execute(
                "select rowid from questions where sentence=?",
                (self.sentence,))

            # if current sentence doesn't exist in database, insert it
            if cursor.fetchone() is None:
                cursor.execute(
                    "insert into questions values (?,?,?,0)",
                    (self.sentence, self.keyword, choices))
def edit_config(self, subfolder):
    """Point the testing dataset paths of the system config at ``subfolder``.

    Both ``TESTING_DATASET_PATH`` and ``TESTING_DATASET_LG_PATH`` are set in
    the loaded configuration and then written back to
    ``configs/full_system_infty.conf`` under ``self.dprl_project_location``.

    :param subfolder: dataset subfolder name; the lg path uses
        ``subfolder + '_lg'``
    """
    relative_dataset_lg_location = '../Data/Expressions/lg_output/'
    relative_dataset_location = '../Data/Expressions/inkml/'
    config_location = (self.dprl_project_location
                       + 'configs/full_system_infty.conf')
    config = Configuration.from_file(config_location)

    # inkml
    inkml_key = 'TESTING_DATASET_PATH'
    config.get_str(inkml_key)  # read before the update; value not used
    config.set(inkml_key, relative_dataset_location + subfolder)
    inkml_path = config.get_str(inkml_key)
    config.write_to_file(config_location, inkml_key, inkml_path)

    # lg
    lg_key = 'TESTING_DATASET_LG_PATH'
    config.get_str(lg_key)  # read before the update; value not used
    config.set(lg_key, relative_dataset_lg_location + subfolder + '_lg')
    lg_path = config.get_str(lg_key)
    config.write_to_file(config_location, lg_key, lg_path)
def __init__(self):
    """Print an empty line and attach a fresh ``Configuration``."""
    # blank line on stdout before anything else happens
    print()

    configuration = Configuration()
    self.config = configuration
#!/usr/bin/python3.5
"""Entry point wiring the unix socket, the database service and messaging."""

import logging

from configuration.configuration import Configuration
from messaging.videoconversionmessaging import VideoConversionMessaging
from database.mongodb.videoconversion import VideoConversion
from videoconvunixsocket.videoconversionunixsocket import VideoConversionUnixSocket

if __name__ == '__main__':
    logging.basicConfig(
        format='%(asctime)s - %(levelname)s: %(message)s',
        level=logging.DEBUG)

    configuration = Configuration()
    # Uncomment to dump the loaded settings:
    #logging.info(configuration.get_rabbitmq_host())
    #logging.info(configuration.get_rabbitmq_port())
    #logging.info(configuration.get_messaging_conversion_queue())
    #logging.info(configuration.get_database_name())
    #logging.info(configuration.get_video_conversion_collection())

    # The unix socket is started first; the messaging object is attached to
    # it once it has been built.
    video_unix_socket = VideoConversionUnixSocket()
    video_unix_socket.start()

    video_conversion_service = VideoConversion(configuration)
    video_messaging = VideoConversionMessaging(
        configuration, video_conversion_service)
    video_unix_socket.setVideoConversionMessaging(video_messaging)
class WikiCorpus(object): """Class representing corpus from Wikipedia of one language """ # configuration file CORPUS_CONFIG_PATH = project_path('wikicorpus/corpus-config.yaml') # original dump file name DUMP_ORIGINAL_NAME = 'pages-articles.xml.bz2' # dump url DUMP_URL_GENERAL = 'http://dumps.wikimedia.org/{lang}wiki/latest/'\ + '{lang}wiki-latest-' + DUMP_ORIGINAL_NAME # md5 checksum file url MD5_URL_GENERAL = 'http://dumps.wikimedia.org/{lang}wiki/latest/'\ + '{lang}wiki-latest-md5sums.txt' # Wikipedia namespace number label for articles ARTICLE_NS = '0' def __init__(self, language): """Initalization of WikiCorpus instance :language: unicode """ # TODO: check if language is in dictionary of iso codes self._language = language # load configuration self._configuration = Configuration(WikiCorpus.CORPUS_CONFIG_PATH) # vertical info #self._tagset = None #self._structures = None # always _BASIC_STRUCTURES # ------------------------------------------------------------------------ # getters and setters # ------------------------------------------------------------------------ def get_corpus_name(self): """ Returns corpus name """ return self._configuration.get('corpus-name').format( lang=self.language()) def get_dump_path(self): """ Returns path to dump """ # full dumps are bzipped, while sample dumps are uncompressed if self.is_dump_compressed(): ext = self._configuration.get('extensions', 'compressed-dump') else: ext = self._configuration.get('extensions', 'uncompressed-dump') # dump file name = corpus name + extension dump_file_name = '{name}.{ext}'.format( name=self.get_corpus_name(), ext=ext) # path = path to verticals + dump file name path = os.path.join( self.get_uncompiled_corpus_path(), dump_file_name) return path def get_dump_length(self): """Returns length of the dump Note: For compressed dumps, this is larger number than file size. 
""" if self.is_dump_compressed(): raise NotImplemented('calculated uncompressed dump length is not supported') #print 'Calculating uncompressed dump length...' #with self._open_dump() as dump_file: # dump_file.seek(0, os.SEEK_END) # length = dump_file.tell() # return length else: return os.path.getsize(self.get_dump_path()) def get_namespace(self): """Returns namespace of the wiki dump """ with self._open_dump() as dump_file: # read first event, which is ('start', root element), context_for_ns = etree.iterparse(dump_file, events=('start',)) _, root = context_for_ns.next() # get namespace information from the root element, # None means implicit namespace (without prefix) namespace = root.nsmap[None] del context_for_ns return namespace def get_prevertical_path(self): """ Returns path to prevertical """ # prevertical file name = corpus name + extension prevertical_file_name = '{name}.{ext}'.format( name=self.get_corpus_name(), ext=self._configuration.get('extensions', 'prevertical')) # path = path to verticals + prevertical file name path = os.path.join( self.get_uncompiled_corpus_path(), prevertical_file_name) return path def get_registry_path(self): """ Returns path to registry file. It will also creates non-existing directories on this path """ registry_dir = environment.registry_path() makedirs(registry_dir) path = os.path.join( registry_dir, self.get_corpus_name()) return path #def get_tagset(self): # """Returns tagset of the corpus. # @return: [registry.tagsets.tagset] || None # @throws: RegistryException # """ # # first, if _tagset is None, update the tagset ifnormation # if self._tagset is None: # self._tagset = get_registry_tagset(self.get_registry_path()) # return self._tagset def get_url_prefix(self): """Returns url prefix for all articles in the corpus. 
""" return 'http://{lang}.wikipedia.org/wiki'.format(lang=self.language()) def get_vertical_path(self): """ Returns path to vertical """ # vertical file name = corpus name + extension vertical_file_name = '{name}.{ext}'.format( name=self.get_corpus_name(), ext=self._configuration.get('extensions', 'vertical')) # path = path to verticals + vertical file name path = os.path.join( self.get_uncompiled_corpus_path(), vertical_file_name) return path def get_uncompiled_corpus_path(self): """ Returns path to directory with verticals for this corpus It will also creates non-existing directories on this path """ path = os.path.join( environment.verticals_path(), self.get_corpus_name()) makedirs(path) return path def get_compiled_corpus_path(self): """ Returns path to directory with compiled corpus It will also creates non-existing directories on this path """ path = os.path.join( environment.compiled_corpora_path(), self.get_corpus_name()) makedirs(path) return path #def is_sample(self): # """ Returns True if this is a sample corpus # """ # return bool(self.sample_size()) def is_dump_compressed(self): """Returns True if dumps is compress, False otherwise. 
""" # dumps for full languages are always compressed return True def language(self): """ Returns corpus language """ return self._language def prevertical_file_exists(self): return os.path.exists(self.get_prevertical_path()) def vertical_file_exists(self): return os.path.exists(self.get_vertical_path()) # ------------------------------------------------------------------------ # corpus building methods # ------------------------------------------------------------------------ def download_dump(self, force=False): """ Downloads dump of Wikipedia :force: Boolean if True, it downloads dump even if some dump with target name is already downloaded """ # select dump path dump_path = self.get_dump_path() if os.path.exists(dump_path) and not force: logging.info('Dump {name} already exists.'.format(name=dump_path)) return # select dump url dump_url = WikiCorpus.DUMP_URL_GENERAL.format(lang=self.language()) logging.info('Started downloading {l}-wiki dump from {url}' .format(l=self.language(), url=dump_url)) # find MD5 checksum md5_url = WikiCorpus.MD5_URL_GENERAL.format(lang=self.language()) md5sums = get_online_file(md5_url, lines=True) for file_md5, file_name in map(lambda x: x.split(), md5sums): if file_name.endswith(WikiCorpus.DUMP_ORIGINAL_NAME): md5sum = file_md5 break else: logging.warning('no matching MD5 checksum for the dump found') md5sum = None # downloading download_large_file(dump_url, dump_path, md5sum=md5sum) logging.info('Downloading of {lang}-wiki dump finished'.format( lang=self.language(), path=dump_path)) def create_prevertical(self): """ Parses dump (outer XML, inner Wiki Markup) and creates prevertical """ prevertical_path = self.get_prevertical_path() namespace = self.get_namespace() # create qualified names (= names with namespaces) for tags we need TEXT_TAG = qualified_name('text', namespace) TITLE_TAG = qualified_name('title', namespace) REDIRECT_TAG = qualified_name('redirect', namespace) NS_TAG = qualified_name('ns', namespace) 
logging.info('Preverticalization of {name} started...'.format( name=self.get_corpus_name())) # iterate through xml and build a sample file with open(prevertical_path, 'w') as prevertical_file: with self._open_dump() as dump_file: context = etree.iterparse(dump_file, events=('end',)) #progressbar = ProgressBar(self.get_dump_length()) last_title = None id_number = 0 # skip first page in full (copressed) dump since it's Main Page skip = True if self.is_dump_compressed() else False # iterate through end-events for event, elem in context: if elem.tag == REDIRECT_TAG: # ignore redirect pages skip = True elif elem.tag == NS_TAG: # ignore nonarticle pages (such as "Help:" etc.) if elem.text != WikiCorpus.ARTICLE_NS: skip = True elif elem.tag == TITLE_TAG: # remember the title last_title = elem.text elif elem.tag == TEXT_TAG: if skip: skip = False continue if not elem.text or not last_title: continue # new id id_number += 1 parsed_doc = parse_wikimarkup(id_number, last_title, self.get_url_prefix(), elem.text) + '\n' prevertical_file.write(parsed_doc.encode('utf-8')) # approximate work done by positin in dump file #progressbar.update(dump_file.tell()) # cleanup elem.clear() #while elem.getprevious() is not None: # del elem.getparent()[0] for ancestor in elem.xpath('ancestor-or-self::*'): while ancestor.getprevious() is not None: del ancestor.getparent()[0] del context #progressbar.finish() logging.info('Prevertical of {name} created at: {path}'.format( name=self.get_corpus_name(), path=prevertical_path)) def create_vertical(self): """ Creates a vertical file. 
Performes tokenization of prevertical and for some languages also morfologization (adding morfological tag and lemma/lempos) NOTE: Kvuli bugu v TreeTaggeru je potreba udelat nechutny hack: 1) provest v prevertikalu nasledujici substituci: </term> ---> __TERM_END__ 2) nechat TreeTagger vytvorit vertikal 3) presunout <term> a </term> na spravne misto s pouzitim vlozene znacky __TERM_END__ """ prevertical_path = self.get_prevertical_path() marked_prevert_path = prevertical_path + '.tmp' vertical_path = self.get_vertical_path() tmp_vertical_path = vertical_path + '.tmp' # check if prevertical file already exists if not self.prevertical_file_exists(): raise CorpusException('Verticalization failed: ' + 'Missing prevertical file.') logging.info('Verticalization of {name} started...'.format( name=self.get_corpus_name())) try: if self.language() == 'en': # ---------------------------------------------------------- # oprava bugu v treetaggeru, krok 1 self._mark_terms(prevertical_path, marked_prevert_path) # ---------------------------------------------------------- # create vertical file with NaturalLanguageProcessor(self.language()) as lp: lp.create_vertical_file(marked_prevert_path, tmp_vertical_path) #self._tagset = tags #self._structures = WikiCorpus._BASIC_STRUCTURES # create registry file self.create_registry() # ---------------------------------------------------------- # oprava bugu v treetaggeru, krok 3 self._correct_terms(tmp_vertical_path, vertical_path) call(('rm', marked_prevert_path, tmp_vertical_path)) # ---------------------------------------------------------- else: with NaturalLanguageProcessor(self.language()) as lp: lp.create_vertical_file(prevertical_path, vertical_path) self.create_registry() logging.info('Vertical of {name} created at: {path}'.format( name=self.get_corpus_name(), path=vertical_path)) except ConfigurationException as exc: raise CorpusException('Verticalization failed: ' + exc.message) except LanguageProcessorException as exc: raise 
CorpusException('Verticalization failed: ' + exc.message) def _mark_terms(self, prevert_path, marked_prevert_path): cmd = "sed 's/<\/term>/ __TERM_END__/g' {fr} > {to}".format( fr=prevert_path, to=marked_prevert_path) task = Popen(cmd, shell=True) task.wait() if task.returncode != 0: raise CorpusException('sed error') def _correct_terms(self, input_path, output_path): last_term_line = None open_term = False #state = 0 # = pocet radku spatne posunuteho termu with open(input_path) as input_file: with open(output_path, 'w') as output_file: for encoded_line in input_file: line = encoded_line.decode('utf-8') if line.startswith('<term '): last_term_line = encoded_line elif line.startswith('</term>'): # ignore continue elif line.startswith('<s>'): output_file.write(encoded_line) if last_term_line: output_file.write(last_term_line) last_term_line = None open_term = True elif line.startswith('__TERM_END__') and open_term: output_file.write(str('</term>\n')) open_term = False elif line.startswith('<'): output_file.write(encoded_line) else: if last_term_line: output_file.write(last_term_line) last_term_line = None open_term = True output_file.write(encoded_line) # if state == 0 and line.startswith('<term '): # last_term_line = encoded_line # state += 1 # elif state == 1 and line.startswith('</term>'): # state += 1 # elif line.startswith('</term>'): # # ignore # continue # elif state == 1 and line.startswith('__TERM_END__'): # # empty term # state = 0 # last_term_line = None # elif state == 2 and line.startswith('<s>'): # output_file.write(encoded_line) # output_file.write(last_term_line) # last_term_line = None # state = 0 # elif state == 2 and line.startswith('<'): # output_file.write(encoded_line) # elif line.startswith('__TERM_END__'): # output_file.write(str('</term>\n')) # else: # if last_term_line: # output_file.write(last_term_line) # last_term_line = None # state = 0 # output_file.write(encoded_line) def infere_terms_occurences(self): """ Labels all occurences of 
terms in morfolgized vertical During terms-inference some postprocessing is done as well (removing desamb hacks, using actual numbers as lemmata). """ if self.language() != 'en': raise CorpusException('terms inference is currently supported only for English') vertical_path = self.get_vertical_path() try: logging.info('Terms occurences inference in {name} started'.format( name=self.get_corpus_name())) output_path = vertical_path + '.terms' #call(('cp', vertical_path, original_vertical_path)) # find tagset (throws exception if registry file not found) #tagset = self.get_tagset() tagset = TAGSETS.TREETAGGER with open(vertical_path) as input_file: with open(output_path, 'w') as output_file: for line in input_file: line = line.decode('utf-8').strip() # TODO: ?osetrit prazdne radky a podobne veci?? if line.startswith('<doc'): document = [line] else: document.append(line) # check if the end of document is reached if line == '</doc>': vertical = VerticalDocument(document, tagset=tagset, terms_inference=True) output_file.write(str(vertical)) logging.info('Terms occurences inference in {name} finished.' 
.format(name=self.get_corpus_name())) except CorpusException as exc: raise CorpusException('Terms inference failed: ' + exc.message) except RegistryException as exc: raise CorpusException('Terms inference failed: ' + exc.message) except LanguageProcessorException as exc: raise CorpusException('Terms inference failed: ' + exc.message) def create_registry(self): """ Creates registry file """ store_registry( path=self.get_registry_path(), lang=self.language(), vertical_path=self.get_vertical_path(), compiled_path=self.get_compiled_corpus_path()) def compile_corpus(self): """ Compiles given corpora """ task = Popen(('compilecorp', '--recompile-corpus', self.get_registry_path(), self.get_vertical_path())) task.wait() if task.returncode != 0: raise CorpusException('Compilation failed.') logging.info('Corpus compiled:' + self.get_compiled_corpus_path()) def check_corpus(self): """Prints compiled corpus status generated by corpcheck. """ task = Popen(('corpcheck', self.get_registry_path())) task.wait() if task.returncode != 0: raise CorpusException('Compiled corpus checking failed.') def print_concordances(self, query): """Prints concordances of a given query @param query: query string in CQL """ call(('corpquery', self.get_registry_path(), query, '-h', '10', # limit of 10 results, '-c', '10', # left and right context of 10 words '-a', 'word', # only show words in the result '-s', 'p,doc')) # only show p and doc structures def print_info(self): """ Returns corpus summary """ # check if corpus exists verticals_path = self.get_uncompiled_corpus_path() if not os.listdir(verticals_path): print 'Corpus %s does not exist.' 
% self.get_corpus_name() return print 'Corpus name:', self.get_corpus_name() # verticals print 'Vertical files:', verticals_path call(('ls', '-lhtcr', verticals_path)) # registry registry_path = self.get_registry_path() if os.path.isfile(registry_path): print 'Registry:', registry_path else: print 'Registry: no' # if there is no registry file, it can't be compiled return # compilation compiled_path = self.get_compiled_corpus_path() print 'Compiled:', if os.listdir(compiled_path): print compiled_path else: print 'no' # ------------------------------------------------------------------------ # private methods # ------------------------------------------------------------------------ @contextmanager def _open_dump(self): """Opened dump (prepared for reading) with statement manager Allows to write: with self._open_dump() as dump_file: do something And dump will be closed automatically no matter what. """ dump_path = self.get_dump_path() try: # open dump if self.is_dump_compressed(): dump_file = bz2.BZ2File(dump_path, 'r') else: dump_file = open(dump_path) try: yield dump_file # [after yield, the body of with statement will be executed] finally: dump_file.close() except IOError as exc: # errno.ENOENT = "No such file or directory" if exc.errno == errno.ENOENT: raise CorpusException('Dump file {name} doesn\'t exist.' .format(name=dump_path)) # ------------------------------------------------------------------------ # magic methods # ------------------------------------------------------------------------ def __str__(self): return unicode(self).encode('utf-8') def __repr__(self): return 'WikiCorpus({lang})'.format(lang=self.language()) def __unicode__(self): return repr(self)