Example #1
# Imports assumed by these examples: defaultdict from the standard library,
# the rest from the Sparv pipeline's public API.
from collections import defaultdict

from sparv import (AllDocuments, Annotation, AnnotationAllDocs, Config,
                   Corpus, Export)


def timespan_sql_with_dateinfo(
        corpus: Corpus = Corpus(),
        out: Export = Export("korp_timespan/timespan.sql"),
        docs: AllDocuments = AllDocuments(),
        token: AnnotationAllDocs = AnnotationAllDocs("<token>"),
        datefrom: AnnotationAllDocs = AnnotationAllDocs(
            "<text>:dateformat.datefrom"),
        dateto: AnnotationAllDocs = AnnotationAllDocs(
            "<text>:dateformat.dateto"),
        timefrom: AnnotationAllDocs = AnnotationAllDocs(
            "<text>:dateformat.timefrom"),
        timeto: AnnotationAllDocs = AnnotationAllDocs(
            "<text>:dateformat.timeto")):
    """Create timespan SQL data for use in Korp."""
    corpus_name = corpus.upper()
    datespans = defaultdict(int)
    datetimespans = defaultdict(int)

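    # Count tokens per date span: whole days (YYYYMMDD) in datespans,
    # full date-times (YYYYMMDDhhmmss) in datetimespans.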
    for doc in docs:
        text_tokens, orphans = Annotation(datefrom.name,
                                          doc=doc).get_children(token)
        if orphans:
            datespans[("0" * 8, "0" * 8)] += len(orphans)
            datetimespans[("0" * 14, "0" * 14)] += len(orphans)
        dateinfo = datefrom.read_attributes(
            doc, (datefrom, dateto, timefrom, timeto))
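        # Pad dates to 8 digits and times to 6 so every key is fixed-width
        # and sorts chronologically as a plain string.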
        for text in text_tokens:
            d = next(dateinfo)
            datespans[(d[0].zfill(8), d[1].zfill(8))] += len(text)
            datetimespans[(d[0].zfill(8) + d[2].zfill(6),
                           d[1].zfill(8) + d[3].zfill(6))] += len(text)

    rows_date = []
    rows_datetime = []

    for span in datespans:
        rows_date.append({
            "corpus": corpus_name,
            "datefrom": span[0],
            "dateto": span[1],
            "tokens": datespans[span]
        })

    for span in datetimespans:
        rows_datetime.append({
            "corpus": corpus_name,
            "datefrom": span[0],
            "dateto": span[1],
            "tokens": datetimespans[span]
        })

    create_sql(corpus_name, out, rows_date, rows_datetime)
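
The zero-padding is what makes the span keys usable: dates become fixed-width 8-digit strings and times 6-digit strings, so the combined keys sort chronologically as plain strings. A standalone sketch of just that key-building step, with invented attribute values in place of real Sparv annotations:

from collections import defaultdict

# Invented (datefrom, dateto, timefrom, timeto) tuples, shaped like the values
# read_attributes() yields; not real corpus data.
dateinfo = [("20200101", "20201231", "0", "235959"),
            ("990401", "990401", "120000", "130000")]

datespans = defaultdict(int)
datetimespans = defaultdict(int)
for d in dateinfo:
    datespans[(d[0].zfill(8), d[1].zfill(8))] += 1
    datetimespans[(d[0].zfill(8) + d[2].zfill(6),
                   d[1].zfill(8) + d[3].zfill(6))] += 1

# datespans now has the keys ("20200101", "20201231") and ("00990401", "00990401");
# datetimespans has e.g. ("20200101000000", "20201231235959").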

Example #2
def freq_list_simple(corpus: Corpus = Corpus(),
                     docs: AllDocuments = AllDocuments(),
                     word: AnnotationAllDocs = AnnotationAllDocs("<token:word>"),
                     pos: AnnotationAllDocs = AnnotationAllDocs("<token:pos>"),
                     baseform: AnnotationAllDocs = AnnotationAllDocs("<token:baseform>"),
                     out: Export = Export("frequency_list/stats_[metadata.id].csv"),
                     delimiter: str = Config("stats_export.delimiter"),
                     cutoff: int = Config("stats_export.cutoff")):
    """Create a word frequency list for a corpus without sense, lemgram and complemgram annotations."""
    freq_dict = defaultdict(int)

    for doc in docs:
        simple_tokens = word.read_attributes(doc, [word, pos, baseform])

        # Add empty annotations ("|" is the pipeline's empty-value marker)
        # for sense, lemgram and complemgram
        tokens = []
        for w, p, b in simple_tokens:
            tokens.append((w, p, b, "|", "|", "|"))
        update_freqs(tokens, freq_dict)

    write_csv(out, freq_dict, delimiter, cutoff)
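
Both exporters delegate the counting to an update_freqs helper that is not shown on this page. A minimal stand-in, assuming each token is a tuple of annotation values used directly as the dictionary key (the real pipeline helper also post-processes senses and compound analyses):

def update_freqs(tokens, freq_dict, include_all_compounds=False):
    """Hypothetical counting helper; not the pipeline's actual code."""
    for token in tokens:
        # The whole annotation tuple is the key, so identical
        # (word, pos/msd, baseform, ...) combinations are merged.
        freq_dict[tuple(token)] += 1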

Example #3
def freq_list(corpus: Corpus = Corpus(),
              docs: AllDocuments = AllDocuments(),
              word: AnnotationAllDocs = AnnotationAllDocs("<token:word>"),
              msd: AnnotationAllDocs = AnnotationAllDocs("<token:msd>"),
              baseform: AnnotationAllDocs = AnnotationAllDocs("<token:baseform>"),
              sense: AnnotationAllDocs = AnnotationAllDocs("<token:sense>"),
              lemgram: AnnotationAllDocs = AnnotationAllDocs("<token>:saldo.lemgram"),
              complemgram: AnnotationAllDocs = AnnotationAllDocs("<token>:saldo.complemgram"),
              out: Export = Export("frequency_list/stats_[metadata.id].csv"),
              delimiter: str = Config("stats_export.delimiter"),
              cutoff: int = Config("stats_export.cutoff"),
              include_all_compounds: bool = Config("stats_export.include_all_compounds")):
    """Create a word frequency list for the entire corpus.

    Args:
        corpus (str, optional): The corpus ID. Defaults to Corpus.
        docs (list, optional): The documents belonging to this corpus. Defaults to AllDocuments.
        word (str, optional): Word annotations. Defaults to AnnotationAllDocs("<token:word>").
        msd (str, optional): MSD annotations. Defaults to AnnotationAllDocs("<token:msd>").
        baseform (str, optional): Baseform annotations. Defaults to AnnotationAllDocs("<token:baseform>").
        sense (str, optional): Sense annotations. Defaults to AnnotationAllDocs("<token:sense>").
        lemgram (str, optional): Lemgram annotations. Defaults to AnnotationAllDocs("<token>:saldo.lemgram").
        complemgram (str, optional): Compound lemgram annotations.
            Defaults to AnnotationAllDocs("<token>:saldo.complemgram").
        out (str, optional): The output word frequency file.
            Defaults to Export("frequency_list/stats_[metadata.id].csv").
        delimiter (str, optional): Column delimiter to use in the csv. Defaults to Config("stats_export.delimiter").
        cutoff (int, optional): The minimum frequency a word must have in order to be included in the result.
            Defaults to Config("stats_export.cutoff").
        include_all_compounds (bool, optional): Whether to include compound analyses for every word
            or just for the words that are lacking a sense annotation.
            Defaults to Config("stats_export.include_all_compounds").
    """
    freq_dict = defaultdict(int)

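    # Read the six token attributes together and update the counts per document.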
    for doc in docs:
        tokens = word.read_attributes(doc, [word, msd, baseform, sense, lemgram, complemgram])
        update_freqs(tokens, freq_dict, include_all_compounds)

    write_csv(out, freq_dict, delimiter, cutoff)
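
write_csv is likewise referenced but not defined here. A plausible minimal version that honors the delimiter and cutoff parameters (treating the Export value as a file path; a sketch, not the pipeline's implementation):

import csv

def write_csv(out, freq_dict, delimiter, cutoff):
    """Hypothetical writer: one row per annotation tuple plus its frequency."""
    with open(out, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile, delimiter=delimiter)
        # Most frequent entries first; drop anything below the cutoff.
        for key, freq in sorted(freq_dict.items(), key=lambda kv: -kv[1]):
            if freq < cutoff:
                continue
            writer.writerow(list(key) + [freq])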