def parse_and_preprocess_src(data_source, corpus_destination, preprocess=True):
    """Load raw documents, optionally preprocess them, and save a Corpus.

    The loader is chosen by searching the lower-cased ``data_source`` for a
    known pattern. Patterns are tried in order and the first hit wins; a
    source matching none of them is loaded as abstracts.

    The corpus language is read from the first loaded document, so the
    loader has to return at least one document.

    Args:
        data_source: Location of the raw data; also selects the loader.
        corpus_destination: Target handed to ``Corpus.save_corpus``.
        preprocess: When true, ``Preprocessor.preprocess`` is run on the
            loaded documents before the corpus is built.
    """
    # (regex pattern, corpus name, loader) in priority order. The loaders are
    # wrapped in lambdas so only the selected one is ever looked up / called.
    known_sources = (
        ("bundestag", "bundestag",
         lambda src: DataHandler.get_bundestag_speeches(directory=src)),
        ("sustainability", "sustainability",
         lambda src: DataHandler.get_sustainability_data(path=src)),
        ("unv1.0-tei", "united_nations",
         lambda src: DataHandler.get_un_texts(directory=src)),
        ("state_of_the_union", "state_of_the_union",
         lambda src: DataHandler.get_state_of_the_union(directory=src)),
    )

    lowered_source = data_source.lower()
    for pattern, corpus_name, load in known_sources:
        if re.search(pattern, lowered_source):
            documents = load(data_source)
            break
    else:
        # Fallback when no pattern matched.
        corpus_name = "abstracts"
        documents = DataHandler.get_abstracts(path=data_source)

    corpus_language = documents[0].language
    print('loaded', len(documents), 'documents')

    if preprocess:
        Preprocessor.preprocess(documents, language=corpus_language)
        print('preprocessed', len(documents), 'documents')

    corpus = Corpus(source=documents,
                    language=corpus_language,
                    name=corpus_name)
    parsed_count = len(corpus.get_documents(as_list=True))
    print('parsed', parsed_count, 'documents to a Corpus')
    corpus.save_corpus(corpus_destination)
def cleaning_authors(config, overwrite=False):
    """Normalise the ``author`` field of every document in the listed corpora.

    Per document:
      * a falsy author is left untouched;
      * a float NaN author is replaced by ``None``;
      * for ``bundestag_corpus`` the author is wrapped in a one-element list;
      * for ``sustainability_corpus`` a string author is split into a list
        of names, a non-string author is kept as is;
      * for any other corpus, documents whose ``language`` is not
        ``"English"`` are skipped (and counted), the rest are handled like
        the sustainability corpus and counted as multi- or single-author.

    Finally the three counters (skipped for language, multi-author,
    single-author) are printed.

    Args:
        config: Mapping whose ``config["corpora"][<corpus name>]`` entry is
            the path the corpus is loaded from and saved to.
        overwrite: When false, the existing file is first renamed to the
            path returned by ``create_new_filepath_uncleaned``.
    """

    def _as_author_list(raw_author):
        # Non-string authors are passed through unchanged.
        if not isinstance(raw_author, str):
            return raw_author
        # Comma-separated tokens are paired up (even index, odd index) and
        # each pair is rendered as "<odd>. <even>"; zip() drops an unpaired
        # trailing token.
        tokens = [token.strip() for token in raw_author.split(',')]
        return [
            f'{second}. {first}'
            for first, second in zip(tokens[::2], tokens[1::2])
        ]

    # Only the Bundestag corpus is processed at the moment; the entries
    # "sustainability_corpus" and "abstract_corpus" are switched off.
    corpus_names = ["bundestag_corpus"]
    languages = [Language.DE, Language.EN, Language.EN]

    wrong_language_count = 0
    multi_author_count = 0
    single_author_count = 0

    for corpus_name, corpus_language in zip(corpus_names, languages):
        corpus_path = config["corpora"][corpus_name]
        corpus = Corpus(source=corpus_path,
                        language=corpus_language,
                        name=corpus_name)

        for doc in corpus.get_documents():
            raw_author = doc.author
            if not raw_author:
                continue
            if isinstance(raw_author, float) and np.isnan(raw_author):
                doc.author = None
                continue

            if corpus_name == "bundestag_corpus":
                authors = [raw_author]
            elif corpus_name == "sustainability_corpus":
                authors = _as_author_list(raw_author)
            else:
                if doc.language != "English":
                    wrong_language_count += 1
                    continue
                authors = _as_author_list(raw_author)
                if len(authors) > 1:
                    multi_author_count += 1
                    print(raw_author, authors)
                else:
                    single_author_count += 1

            doc.author = authors

        if not overwrite:
            # Move the existing file aside before it is replaced.
            os.rename(src=corpus_path,
                      dst=create_new_filepath_uncleaned(corpus_path))

        corpus.save_corpus(corpus_path)

    print(wrong_language_count, multi_author_count, single_author_count)
def cleaning_punctuation(config, overwrite=False):
    """Run ``remove_punctuation`` on three corpora and save each one again.

    The Bundestag, sustainability and abstract corpora are loaded from
    ``config["corpora"][<corpus name>]``, passed to ``remove_punctuation``
    and written back to the same path.

    Args:
        config: Mapping with a ``"corpora"`` entry that maps corpus names
            to file paths.
        overwrite: When false, the existing file is first renamed to the
            path returned by ``create_new_filepath_uncleaned``.
    """
    corpus_specs = (
        ("bundestag_corpus", Language.DE),
        ("sustainability_corpus", Language.EN),
        ("abstract_corpus", Language.EN),
    )

    for corpus_name, corpus_language in corpus_specs:
        corpus_path = config["corpora"][corpus_name]
        corpus = Corpus(source=corpus_path,
                        language=corpus_language,
                        name=corpus_name)
        # The return value is ignored, so this presumably edits the corpus
        # in place -- verify against remove_punctuation.
        remove_punctuation(corpus)

        if not overwrite:
            backup_path = create_new_filepath_uncleaned(corpus_path)
            os.rename(src=corpus_path, dst=backup_path)

        corpus.save_corpus(corpus_path)
def cleaning_un(config, overwrite=True):
    """Drop undated United Nations documents and store dates as ``int``.

    The corpus is loaded from ``config["corpora"]["united_nations_corpus"]``,
    rebuilt from the documents that have a truthy ``date``, every remaining
    ``date`` is converted with ``int()``, and the result is saved back to
    the same path. The corpus size is printed before and after the
    conversion.

    Args:
        config: Mapping with a ``"corpora"`` entry that maps corpus names
            to file paths.
        overwrite: When false, the existing file is first renamed to the
            path returned by ``create_new_filepath_uncleaned``.
    """
    corpus_path = config["corpora"]["united_nations_corpus"]

    # NOTE(review): Language.DE is hard-coded for the United Nations corpus
    # -- confirm this is intended and not a leftover from the Bundestag code.
    loaded = Corpus(source=corpus_path,
                    language=Language.DE,
                    name="united_nations_corpus")

    dated_documents = [doc for doc in loaded.get_documents() if doc.date]
    corpus = Corpus(source=dated_documents,
                    language=loaded.language,
                    name=loaded.name)
    print("1", len(corpus))

    for doc in corpus.get_documents():
        doc.date = int(doc.date)
    print("2", len(corpus))

    if not overwrite:
        backup_path = create_new_filepath_uncleaned(corpus_path)
        os.rename(src=corpus_path, dst=backup_path)

    corpus.save_corpus(corpus_path)
def cleaning_bundestag(config, overwrite=True):
    """Drop undated Bundestag documents and store dates as ``int``.

    The corpus is loaded from ``config["corpora"]["bundestag_corpus"]``,
    rebuilt from the documents that have a truthy ``date``, every remaining
    ``date`` is converted with ``int()``, and the result is saved back to
    the same path. The corpus size is printed before and after the
    conversion.

    Args:
        config: Mapping with a ``"corpora"`` entry that maps corpus names
            to file paths.
        overwrite: When false, the existing file is first renamed to the
            path returned by ``create_new_filepath_uncleaned``.
    """
    corpus_path = config["corpora"]["bundestag_corpus"]
    loaded = Corpus(source=corpus_path,
                    language=Language.DE,
                    name="bundestag_corpus")

    # Keep only documents that carry a date at all.
    dated_documents = [doc for doc in loaded.get_documents() if doc.date]
    corpus = Corpus(source=dated_documents,
                    language=loaded.language,
                    name=loaded.name)
    print("1", len(corpus))

    for doc in corpus.get_documents():
        doc.date = int(doc.date)
    print("2", len(corpus))

    if not overwrite:
        backup_path = create_new_filepath_uncleaned(corpus_path)
        os.rename(src=corpus_path, dst=backup_path)

    corpus.save_corpus(corpus_path)
def cleaning_abstracts(config, overwrite=True):
    """Keep abstracts with a four-digit year and store the year as ``int``.

    The corpus is loaded from ``config["corpora"]["abstract_corpus"]`` and
    its size is printed. It is then rebuilt from the documents whose
    ``date`` is truthy and whose string form consists of exactly four
    decimal digits; every remaining ``date`` is converted with ``int()``,
    the new size is printed, and the corpus is saved back to the same path.

    Args:
        config: Mapping with a ``"corpora"`` entry that maps corpus names
            to file paths.
        overwrite: When false, the existing file is first renamed to the
            path returned by ``create_new_filepath_uncleaned``.
    """
    corpus_path = config["corpora"]["abstract_corpus"]
    corpus = Corpus(source=corpus_path,
                    language=Language.EN,
                    name="abstract_corpus")
    print("1", len(corpus))

    def _is_four_digit_year(date):
        # The check runs on str(date) throughout, so a non-string date
        # (e.g. an int year) no longer raises AttributeError.
        # isdecimal() is used instead of isnumeric() because isnumeric()
        # also accepts characters such as superscripts or fractions that
        # int() rejects, which would crash the conversion loop below.
        if not date:
            return False
        text = str(date)
        return len(text) == 4 and text.isdecimal()

    corpus = Corpus(
        [d for d in corpus.get_documents() if _is_four_digit_year(d.date)],
        name=corpus.name,
        language=Language.EN)
    for d in corpus.get_documents():
        d.date = int(d.date)
    print("2", len(corpus))

    if not overwrite:
        # Move the existing file aside before it is replaced.
        os.rename(src=corpus_path,
                  dst=create_new_filepath_uncleaned(corpus_path))

    corpus.save_corpus(corpus_path)