def install_json(jsonfile: ExportInput = ExportInput("[metadata.id].json"),
                 out: OutputCommonData = OutputCommonData("sbx_metadata.install_json_export_marker"),
                 export_path: str = Config("sbx_metadata.json_export_path"),
                 host: str = Config("sbx_metadata.json_export_host")):
    """Copy JSON metadata to remote host.

    Args:
        jsonfile: JSON metadata export file to install.
        out: Marker written once the install step has completed.
        export_path: Directory on the remote host to copy the file into.
        host: Remote host to install to; required.

    Raises:
        util.SparvErrorMessage: If no remote host is configured.
    """
    # A remote host is mandatory for this installer — fail early with a
    # user-facing message instead of attempting a copy.
    if not host:
        raise util.SparvErrorMessage("'sbx_metadata.json_export_host' not set! JSON export not installed.")

    # Target path on the remote host keeps the export file's own basename.
    target_path = os.path.join(export_path, Path(jsonfile).name)
    util.install_file(host, jsonfile, target_path)
    out.write("")
def install_metashare(xmlfile: ExportInput = ExportInput("sbx_metadata/[metadata.id].xml"),
                      out: OutputCommonData = OutputCommonData("sbx_metadata.install_metashare_marker"),
                      export_path: str = Config("sbx_metadata.metashare_path"),
                      host: str = Config("sbx_metadata.metashare_host")):
    """Copy META-SHARE file to remote host.

    Args:
        xmlfile: META-SHARE XML export file to install.
        out: Marker written once the install step has completed.
        export_path: Directory on the remote host to copy the file into.
        host: Remote host to install to; required.

    Raises:
        util.SparvErrorMessage: If no remote host is configured.
    """
    # Refuse to run without a configured remote host.
    if not host:
        raise util.SparvErrorMessage("'sbx_metadata.metashare_host' not set! META-SHARE export not installed.")

    # Remote destination: export directory plus the XML file's basename.
    target_path = os.path.join(export_path, Path(xmlfile).name)
    util.install_file(host, xmlfile, target_path)
    out.write("")
def install_original(corpus: Corpus = Corpus(),
                     xmlfile: ExportInput = ExportInput("[xml_export.filename_compressed]"),
                     out: OutputCommonData = OutputCommonData("xml_export.install_export_pretty_marker"),
                     export_path: str = Config("xml_export.export_original_path"),
                     host: str = Config("xml_export.export_original_host")):
    """Copy compressed combined unscrambled XML to remote host.

    Args:
        corpus: The corpus being installed.
        xmlfile: Compressed combined XML export file to install.
        out: Marker written when the install step has completed.
        export_path: Target path on the remote host.
        host: Remote host to install to.
    """
    # Thin wrapper: all work (copy + marker writing, presumably including any
    # host validation) is delegated to xml_utils — confirm against that helper.
    xml_utils.install_compressed_xml(corpus, xmlfile, out, export_path, host)
def install_relations(sqlfile: ExportInput = ExportInput("korp_wordpicture/relations.sql"),
                      out: OutputCommonData = OutputCommonData("korp.install_relations_marker"),
                      db_name: str = Config("korp.mysql_dbname"),
                      host: str = Config("korp.remote_host")):
    """Install Korp's Word Picture SQL on remote host.

    Args:
        sqlfile (ExportInput, optional): SQL file to be installed.
            Defaults to ExportInput("korp_wordpicture/relations.sql").
        out (OutputCommonData, optional): Marker file to be written.
            Defaults to OutputCommonData("korp.install_relations_marker").
        db_name (str, optional): Name of the database.
            Defaults to Config("korp.mysql_dbname").
        host (str, optional): Remote host to install to.
            Defaults to Config("korp.remote_host").
    """
    util.install_mysql(host, db_name, sqlfile)
    # Write empty marker so the pipeline records this install step as done.
    out.write("")
def install_lemgrams(sqlfile: ExportInput = ExportInput("korp_lemgram_index/lemgram_index.sql"),
                     marker: OutputCommonData = OutputCommonData("korp.install_lemgram_marker"),
                     db_name: str = Config("korp.mysql_dbname"),
                     host: str = Config("korp.remote_host")):
    """Install lemgram SQL on remote host.

    Args:
        sqlfile (ExportInput, optional): SQL file to be installed.
            Defaults to ExportInput("korp_lemgram_index/lemgram_index.sql").
        marker (OutputCommonData, optional): Marker file to be written.
            Defaults to OutputCommonData("korp.install_lemgram_marker").
        db_name (str, optional): Name of the database.
            Defaults to Config("korp.mysql_dbname").
        host (str, optional): Remote host to install to.
            Defaults to Config("korp.remote_host").
    """
    util.install_mysql(host, db_name, sqlfile)
    # Write empty marker so the pipeline records this install step as done.
    marker.write("")
def info_sentences(out: OutputCommonData = OutputCommonData("cwb.sentencecount"),
                   sentence: AnnotationAllDocs = AnnotationAllDocs("<sentence>"),
                   docs: AllDocuments = AllDocuments()):
    """Determine how many sentences there are in the corpus.

    Args:
        out: Output data holding the total sentence count as a string.
        sentence: Sentence annotation to count spans from.
        docs: All documents in the corpus.
    """
    total = 0
    for document in docs:
        # Documents without a sentence annotation file simply contribute zero.
        try:
            spans = list(sentence.read_spans(document))
        except FileNotFoundError:
            continue
        total += len(spans)

    if not total:
        log.info("No sentence information found in corpus")

    # Persist the corpus-wide sentence count.
    out.write(str(total))
def resolution(out_resolution: OutputCommonData = OutputCommonData("dateformat.resolution"),
               informat: Optional[str] = Config("dateformat.datetime_informat")):
    """Get the datetime resolution from the informat defined in the corpus config.

    Args:
        out_resolution: Date format output.
        informat: Date in-format, used to calculate date resolution.
    """
    # Each group of strptime directives implies one resolution letter, checked
    # in coarse-to-fine order so the letters come out as e.g. "YMDhms".
    directive_groups = (
        (("%Y", "%y"), "Y"),
        (("%b", "%B", "%m"), "M"),
        (("%a", "%A", "%w", "%d"), "D"),
        (("%H", "%I"), "h"),
        (("%M",), "m"),
        (("%S",), "s"),
    )

    found = []
    if informat:
        # The informat may contain several alternatives separated by "|".
        for fmt in informat.strip("|").split("|"):
            letters = "".join(letter for directives, letter in directive_groups
                              if any(d in fmt for d in directives))
            found.append(letters)

    # Sort with more fine-grained resolutions first.
    found.sort(key=len, reverse=True)

    # Write time resolution file.
    out_resolution.write("|".join(found))
def info_date_unknown(out_datefirst: OutputCommonData = OutputCommonData("cwb.datefirst"),
                      out_datelast: OutputCommonData = OutputCommonData("cwb.datelast")):
    """Create empty datefirst and datelast file (needed for .info file) if corpus has no date information.

    Args:
        out_datefirst: Output for the first date in the corpus (left empty).
        out_datelast: Output for the last date in the corpus (left empty).
    """
    log.info("No date information found in corpus")

    # The .info file machinery expects both files to exist, so write them empty.
    for marker in (out_datefirst, out_datelast):
        marker.write("")
def info_date(
        corpus: Corpus = Corpus(),
        out_datefirst: OutputCommonData = OutputCommonData("cwb.datefirst"),
        out_datelast: OutputCommonData = OutputCommonData("cwb.datelast"),
        corpus_data_file: ExportInput = ExportInput("[cwb.corpus_registry]/[metadata.id]"),
        datefrom: AnnotationAllDocs = AnnotationAllDocs("[dateformat.out_annotation]:dateformat.datefrom"),
        dateto: AnnotationAllDocs = AnnotationAllDocs("[dateformat.out_annotation]:dateformat.dateto"),
        timefrom: AnnotationAllDocs = AnnotationAllDocs("[dateformat.out_annotation]:dateformat.timefrom"),
        timeto: AnnotationAllDocs = AnnotationAllDocs("[dateformat.out_annotation]:dateformat.timeto"),
        remove_namespaces: bool = Config("export.remove_module_namespaces", False),
        cwb_bin_path: Config = Config("cwb.bin_path", ""),
        registry: str = Config("cwb.corpus_registry")):
    """Create datefirst and datelast file (needed for .info file).

    Runs cwb-scan-corpus over the date/time annotations, parses its output
    into sorted timestamps, and writes the earliest one to out_datefirst and
    the latest one to out_datelast.
    """

    def fix_name(name: str):
        """Remove invalid characters from annotation names and optionally remove namespaces."""
        if remove_namespaces:
            # Keep only the last dotted component after the ":" separator,
            # e.g. "a:mod.datefrom" -> "a:datefrom".
            prefix, part, suffix = name.partition(":")
            suffix = suffix.split(".")[-1]
            name = prefix + part + suffix
        # CWB attribute names cannot contain ":", so replace with "_".
        return name.replace(":", "_")

    def _parse_cwb_output(output):
        """Parse cwb-scan-corpus output bytes into sorted 'YYYY-MM-DD HH:MM:SS' strings."""
        lines = output.decode("utf8").split("\n")
        # NOTE(review): assumes each non-empty line is tab-separated with the
        # date in field 1 and the time in field 2 — confirm against the
        # cwb-scan-corpus output format.
        values = ["%s %s" % (line.split("\t")[1], line.split("\t")[2]) for line in lines if line.split("\t")[-1]]
        # Fix dates with less than 8 digits (e.g. 800 --> 0800), needed by strptime
        values = ["%s %s" % (v.split()[0].zfill(8), v.split()[1]) for v in values]
        # Convert to dates and sort, then convert to human readable format
        values = sorted([datetime.strptime(v, "%Y%m%d %H%M%S") for v in values])
        return [v.strftime("%Y-%m-%d %H:%M:%S") for v in values]

    # Get date and time annotation names
    datefrom_name = fix_name(datefrom.name)
    timefrom_name = fix_name(timefrom.name)
    dateto_name = fix_name(dateto.name)
    timeto_name = fix_name(timeto.name)

    # Get datefirst and write to file (first element of the sorted output)
    datefirst_args = ["-r", registry, "-q", corpus, datefrom_name, timefrom_name]
    datefirst_out, _ = util.system.call_binary(os.path.join(cwb_bin_path, "cwb-scan-corpus"), datefirst_args)
    datefirst = _parse_cwb_output(datefirst_out)[0]
    out_datefirst.write(str(datefirst))

    # Get datelast and write to file (last element of the sorted output)
    datelast_args = ["-r", registry, "-q", corpus, dateto_name, timeto_name]
    datelast_out, _ = util.system.call_binary(os.path.join(cwb_bin_path, "cwb-scan-corpus"), datelast_args)
    datelast = _parse_cwb_output(datelast_out)[-1]
    out_datelast.write(str(datelast))