コード例 #1
0
def install_json(
    jsonfile: ExportInput = ExportInput("[metadata.id].json"),
    out: OutputCommonData = OutputCommonData("sbx_metadata.install_json_export_marker"),
    export_path: str = Config("sbx_metadata.json_export_path"),
    host: str = Config("sbx_metadata.json_export_host"),
):
    """Install the exported JSON metadata file on a remote host.

    The file keeps its base name and is placed under ``export_path``. After the
    install call has returned, an empty marker is written through ``out``.

    Args:
        jsonfile: Path of the exported JSON file to install.
        out: Marker output; receives an empty string when the install is done.
        export_path: Directory that the file name is joined onto to form the
            destination path.
        host: Host handed to ``util.install_file``.

    Raises:
        util.SparvErrorMessage: If ``host`` is empty (falsy).
    """
    # Without a host there is nowhere to install to, so stop before any file work.
    if not host:
        raise util.SparvErrorMessage(
            "'sbx_metadata.json_export_host' not set! JSON export not installed."
        )

    # Only the base name of the local file is kept at the destination.
    destination = os.path.join(export_path, Path(jsonfile).name)
    util.install_file(host, jsonfile, destination)

    # The marker is written last, after the install call has returned.
    out.write("")
コード例 #2
0
def install_metashare(
    xmlfile: ExportInput = ExportInput("sbx_metadata/[metadata.id].xml"),
    out: OutputCommonData = OutputCommonData("sbx_metadata.install_metashare_marker"),
    export_path: str = Config("sbx_metadata.metashare_path"),
    host: str = Config("sbx_metadata.metashare_host"),
):
    """Install the exported META-SHARE XML file on a remote host.

    The file keeps its base name and is placed under ``export_path``. After the
    install call has returned, an empty marker is written through ``out``.

    Args:
        xmlfile: Path of the exported META-SHARE XML file to install.
        out: Marker output; receives an empty string when the install is done.
        export_path: Directory that the file name is joined onto to form the
            destination path.
        host: Host handed to ``util.install_file``.

    Raises:
        util.SparvErrorMessage: If ``host`` is empty (falsy).
    """
    # Without a host there is nowhere to install to, so stop before any file work.
    if not host:
        raise util.SparvErrorMessage(
            "'sbx_metadata.metashare_host' not set! META-SHARE export not installed."
        )

    # Only the base name of the local file is kept at the destination.
    destination = os.path.join(export_path, Path(xmlfile).name)
    util.install_file(host, xmlfile, destination)

    # The marker is written last, after the install call has returned.
    out.write("")
コード例 #3
0
def install_original(
    corpus: Corpus = Corpus(),
    xmlfile: ExportInput = ExportInput("[xml_export.filename_compressed]"),
    out: OutputCommonData = OutputCommonData("xml_export.install_export_pretty_marker"),
    export_path: str = Config("xml_export.export_original_path"),
    host: str = Config("xml_export.export_original_host"),
):
    """Copy the compressed, combined, unscrambled XML export to a remote host.

    All of the work is delegated to ``xml_utils.install_compressed_xml``; this
    function only supplies the export-specific inputs and configuration.

    Args:
        corpus: The corpus being installed.
        xmlfile: The compressed XML export file.
        out: Marker output, forwarded to the helper.
        export_path: Configured destination path for the original-format export.
        host: Configured host for the original-format export.
    """
    # NOTE(review): the marker name contains "pretty" although this is the
    # "original" install - confirm that sharing the marker name is intended.
    xml_utils.install_compressed_xml(corpus, xmlfile, out, export_path, host)
コード例 #4
0
def install_relations(
    sqlfile: ExportInput = ExportInput("korp_wordpicture/relations.sql"),
    out: OutputCommonData = OutputCommonData("korp.install_relations_marker"),
    db_name: str = Config("korp.mysql_dbname"),
    host: str = Config("korp.remote_host"),
):
    """Install Korp's Word Picture SQL on a remote host.

    The SQL file is passed to ``util.install_mysql`` and an empty marker is
    written through ``out`` afterwards.

    Args:
        sqlfile: SQL file to be installed.
        out: Marker output; receives an empty string once the install call
            has returned.
        db_name: Name of the database to install into.
        host: Remote host to install to.
    """
    util.install_mysql(host, db_name, sqlfile)

    # The marker is written only after the install call has returned.
    out.write("")
コード例 #5
0
def install_lemgrams(
    sqlfile: ExportInput = ExportInput("korp_lemgram_index/lemgram_index.sql"),
    marker: OutputCommonData = OutputCommonData("korp.install_lemgram_marker"),
    db_name: str = Config("korp.mysql_dbname"),
    host: str = Config("korp.remote_host"),
):
    """Install the lemgram index SQL on a remote host.

    The SQL file is passed to ``util.install_mysql`` and an empty marker is
    written through ``marker`` afterwards.

    Args:
        sqlfile: SQL file to be installed.
        marker: Marker output; receives an empty string once the install call
            has returned.
        db_name: Name of the database to install into.
        host: Remote host to install to.
    """
    util.install_mysql(host, db_name, sqlfile)

    # The marker is written only after the install call has returned.
    marker.write("")
コード例 #6
0
ファイル: info.py プロジェクト: heatherleaf/sparv-pipeline
def info_sentences(
    out: OutputCommonData = OutputCommonData("cwb.sentencecount"),
    sentence: AnnotationAllDocs = AnnotationAllDocs("<sentence>"),
    docs: AllDocuments = AllDocuments(),
):
    """Count the sentences in the whole corpus and write the total.

    Args:
        out: Output that receives the total as a decimal string.
        sentence: Sentence annotation, read once per document.
        docs: The documents to iterate over.
    """
    total = 0
    for doc in docs:
        try:
            spans = list(sentence.read_spans(doc))
        except FileNotFoundError:
            # A document whose sentence annotation is missing adds nothing.
            continue
        total += len(spans)

    if not total:
        log.info("No sentence information found in corpus")

    # The count is written even when it is zero.
    out.write(str(total))
コード例 #7
0
def resolution(
    out_resolution: OutputCommonData = OutputCommonData("dateformat.resolution"),
    informat: Optional[str] = Config("dateformat.datetime_informat"),
):
    """Derive the datetime resolution from the configured input format(s).

    ``informat`` may hold several strptime-style formats separated by ``|``.
    Each format is mapped to a string built from the letters ``Y``, ``M``,
    ``D``, ``h``, ``m`` and ``s`` (in that order), one letter per unit the
    format contains. The per-format strings are sorted longest first, joined
    with ``|`` and written out. An empty or missing ``informat`` results in an
    empty string being written.

    Args:
        out_resolution: Output that receives the resolution string.
        informat: Date in-format(s) used to calculate the resolution.
    """
    # Resolution letter and the directives that imply it, listed from the
    # coarsest unit to the finest so that the letters come out in that order.
    directive_groups = (
        ("Y", ("%Y", "%y")),
        ("M", ("%b", "%B", "%m")),
        ("D", ("%a", "%A", "%w", "%d")),
        ("h", ("%H", "%I")),
        ("m", ("%M",)),
        ("s", ("%S",)),
    )

    per_format = []
    if informat:
        # Leading and trailing separators are dropped before splitting.
        for fmt in informat.strip("|").split("|"):
            letters = [
                letter
                for letter, directives in directive_groups
                if any(directive in fmt for directive in directives)
            ]
            per_format.append("".join(letters))

        # More fine-grained resolutions (longer strings) come first.
        per_format.sort(key=len, reverse=True)

    # Write time resolution file
    out_resolution.write("|".join(per_format))
コード例 #8
0
ファイル: info.py プロジェクト: heatherleaf/sparv-pipeline
def info_date_unknown(
    out_datefirst: OutputCommonData = OutputCommonData("cwb.datefirst"),
    out_datelast: OutputCommonData = OutputCommonData("cwb.datelast"),
):
    """Write empty datefirst and datelast data for a corpus without dates.

    Both outputs are needed for the .info file, so they are created with empty
    content when the corpus has no date information.

    Args:
        out_datefirst: Output for the first date; receives an empty string.
        out_datelast: Output for the last date; receives an empty string.
    """
    log.info("No date information found in corpus")

    # Same order as the parameters: datefirst, then datelast.
    for date_output in (out_datefirst, out_datelast):
        date_output.write("")
コード例 #9
0
ファイル: info.py プロジェクト: heatherleaf/sparv-pipeline
def info_date(
        corpus: Corpus = Corpus(),
        out_datefirst: OutputCommonData = OutputCommonData("cwb.datefirst"),
        out_datelast: OutputCommonData = OutputCommonData("cwb.datelast"),
        corpus_data_file: ExportInput = ExportInput(
            "[cwb.corpus_registry]/[metadata.id]"),
        datefrom: AnnotationAllDocs = AnnotationAllDocs(
            "[dateformat.out_annotation]:dateformat.datefrom"),
        dateto: AnnotationAllDocs = AnnotationAllDocs(
            "[dateformat.out_annotation]:dateformat.dateto"),
        timefrom: AnnotationAllDocs = AnnotationAllDocs(
            "[dateformat.out_annotation]:dateformat.timefrom"),
        timeto: AnnotationAllDocs = AnnotationAllDocs(
            "[dateformat.out_annotation]:dateformat.timeto"),
        remove_namespaces: bool = Config("export.remove_module_namespaces",
                                         False),
        cwb_bin_path: Config = Config("cwb.bin_path", ""),
        registry: str = Config("cwb.corpus_registry")):
    """Create the datefirst and datelast data (needed for the .info file).

    ``cwb-scan-corpus`` is run twice: once on the "from" date/time attributes
    to find the earliest timestamp, and once on the "to" attributes to find the
    latest. Each result is written as ``YYYY-MM-DD HH:MM:SS``.

    Args:
        corpus: Corpus identifier passed to ``cwb-scan-corpus``.
        out_datefirst: Output that receives the earliest timestamp.
        out_datelast: Output that receives the latest timestamp.
        corpus_data_file: Not read in this function body; presumably declares
            a dependency on the encoded corpus - TODO confirm.
        datefrom: Annotation whose name gives the "date from" attribute.
        dateto: Annotation whose name gives the "date to" attribute.
        timefrom: Annotation whose name gives the "time from" attribute.
        timeto: Annotation whose name gives the "time to" attribute.
        remove_namespaces: Whether to strip module namespaces from the
            attribute names before they are passed to the scan.
        cwb_bin_path: Directory containing the ``cwb-scan-corpus`` binary.
        registry: Corpus registry path passed via ``-r``.
    """
    def fix_name(name: str):
        """Remove invalid characters from annotation names and optionally remove namespaces."""
        if remove_namespaces:
            # Keep only the last dot-separated piece of what follows the colon.
            head, colon, tail = name.partition(":")
            name = head + colon + tail.split(".")[-1]
        return name.replace(":", "_")

    def _parse_cwb_output(output):
        """Turn raw scan output into a sorted list of readable timestamps."""
        # Keep tab-separated fields 1 and 2 (date, time) of every line whose
        # last field is non-empty; this also drops blank lines.
        raw_stamps = []
        for line in output.decode("utf8").split("\n"):
            fields = line.split("\t")
            if fields[-1]:
                raw_stamps.append("%s %s" % (fields[1], fields[2]))

        # Fix dates with less than 8 digits (e.g. 800 --> 0800), needed by strptime
        padded_stamps = []
        for stamp in raw_stamps:
            parts = stamp.split()
            padded_stamps.append("%s %s" % (parts[0].zfill(8), parts[1]))

        # Convert to dates and sort, then convert to human readable format
        moments = [datetime.strptime(stamp, "%Y%m%d %H%M%S") for stamp in padded_stamps]
        moments.sort()
        return [moment.strftime("%Y-%m-%d %H:%M:%S") for moment in moments]

    # Get date and time annotation names
    first_date_attr = fix_name(datefrom.name)
    first_time_attr = fix_name(timefrom.name)
    last_date_attr = fix_name(dateto.name)
    last_time_attr = fix_name(timeto.name)

    scan_binary = os.path.join(cwb_bin_path, "cwb-scan-corpus")

    def _scan(date_attr, time_attr):
        """Run the scan for one date/time attribute pair and parse its output."""
        scan_args = ["-r", registry, "-q", corpus, date_attr, time_attr]
        stdout, _ = util.system.call_binary(scan_binary, scan_args)
        return _parse_cwb_output(stdout)

    # Get datefirst and write to file (earliest of the sorted "from" stamps)
    earliest = _scan(first_date_attr, first_time_attr)[0]
    out_datefirst.write(str(earliest))

    # Get datelast and write to file (latest of the sorted "to" stamps)
    latest = _scan(last_date_attr, last_time_attr)[-1]
    out_datelast.write(str(latest))