Example 1
import csv
import os

def main():
    # `target_texts` is assumed to be a module-level dict of scraping targets;
    # see Example 3 for its structure
    script_location = os.path.dirname(__file__)
    for link, metainfo in target_texts.items():
        wordcount = {}
        name = metainfo[0] + " - " + metainfo[1]
        # os.path.join joins pathnames "smartly", because Windows uses \ rather than /
        folder = os.path.join(script_location, "corpus", name)
        cleaned_file = os.path.join(folder, "cleaned.txt")
        # count word frequencies in the cleaned text
        with open(cleaned_file) as cleaned_text:
            for line in cleaned_text:
                for word in line.strip().split(" "):
                    if word in wordcount:
                        wordcount[word] += 1
                    else:
                        wordcount[word] = 1
        print(name)
        sorted_wordcount = sorted(wordcount,
                                  key=lambda woord: wordcount[woord],
                                  reverse=True)
        resultfile_path = os.path.join(folder, "wordcount.csv")
        if os.path.exists(resultfile_path):
            os.remove(resultfile_path)
        # newline='' avoids blank rows in the CSV on Windows
        with open(resultfile_path, 'w', newline='') as resultfile:
            resultwriter = csv.writer(resultfile, delimiter=",")
            resultwriter.writerow(["woord", "frequency"])
            for woord in sorted_wordcount:
                resultwriter.writerow([woord, wordcount[woord]])
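The manual if/else counting above works; for reference, the standard library's collections.Counter expresses the same word count more compactly (a sketch, assuming the same cleaned.txt layout):

from collections import Counter

def count_words(cleaned_file):
    wordcount = Counter()
    with open(cleaned_file) as cleaned_text:
        for line in cleaned_text:
            # update() does the same add-or-initialise bookkeeping as the if/else above
            wordcount.update(line.strip().split(" "))
    # most_common() returns (word, frequency) pairs sorted by descending frequency
    return wordcount.most_common()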
Example 2
import csv
import os

from wordcloud import WordCloud

def main():
    script_location = os.path.dirname(__file__)
    for link, metainfo in target_texts.items():
        name = metainfo[0] + " - " + metainfo[1]
        # os.path.join joins pathnames "smartly", because Windows uses \ rather than /
        folder = os.path.join(script_location, "corpus", name)
        tfidf_path = os.path.join(folder, "tfidf.csv")
        imagepath = os.path.join(folder, "wordcloud.png")
        frequencies = {}
        with open(tfidf_path) as csvfile:
            reader = csv.reader(csvfile)
            next(reader)  # skip the header row
            for word, tfidf in reader:
                frequencies[word] = float(tfidf)
        if len(frequencies) == 0:
            continue
        # Generate a word cloud image
        wordcloud = WordCloud(width=800,
                              height=400,
                              mode='RGBA',
                              background_color='white',
                              colormap='inferno').fit_words(frequencies)

        # The PIL way (if you don't have matplotlib)
        image = wordcloud.to_image()
        if os.path.exists(imagepath):
            os.remove(imagepath)
        image.save(imagepath, 'PNG')
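The comment above mentions a matplotlib alternative; a minimal sketch of that route, as in the wordcloud library's documentation (reuses the `wordcloud` object from the loop):

import matplotlib.pyplot as plt

# display the generated cloud instead of (or in addition to) saving a PNG via PIL
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()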
Example 3
import os

def main():
    script_location = os.path.dirname(__file__)
    print("starting scraping!")
    # Set scraping targets as a dict with the following structure:
    # tuftslink: [author, title, year, genre, filterfunction]
    # filterfunction is a function str -> bool that can apply additional filters
    for link, metainfo in target_texts.items():
        print("Scraping target {}".format(metainfo[0] + " - " + metainfo[1]))

        # os.path.join joins pathnames "smartly", because Windows uses \ rather than /
        folder = os.path.join(script_location, "corpus",
                              metainfo[0] + " - " + metainfo[1])
        if os.path.exists(folder):  # if the folder already exists, do nothing
            print("[WARNING] found a folder {} so skipping this site.".format(folder))
            continue  # move the for loop on without processing this item further

        print("scraping: {}".format(link))
        # scrapeText is defined elsewhere in the project
        if metainfo[4]:
            text = scrapeText(link, metainfo[4])
        else:
            text = scrapeText(link)
        os.makedirs(folder, exist_ok=True)

        text_file_path = os.path.join(folder, "text.txt")
        with open(text_file_path, "w") as text_file:
            text_file.write(text)

        meta_file_path = os.path.join(folder, "meta.txt")
        with open(meta_file_path, "w") as meta_file:
            meta_file.write("\n".join(metainfo[:-1]))  # drop the filter function here
        print("finished scraping: {}".format(link))
Example 4
import csv
import math
import os

def inverseDocumentFrequency(word, target_texts):
    N_docs = len(target_texts)
    N_docs_with_word = 0
    script_location = os.path.dirname(__file__)
    for link, metainfo in target_texts.items():
        name = metainfo[0] + " - " + metainfo[1]
        # os.path.join joins pathnames "smartly", because Windows uses \ rather than /
        folder = os.path.join(script_location, "corpus", name)  # append corpus/textname to the current path
        context = os.path.join(folder, "contextcount.csv")  # then append contextcount.csv
        with open(context) as csvfile:  # open takes a string (the filename) as argument
            contextreader = csv.reader(csvfile)
            next(contextreader)  # skip the header row
            for key, frequency in contextreader:
                if word == key:
                    N_docs_with_word += 1
    return math.log(N_docs / N_docs_with_word)
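math.log(N_docs / N_docs_with_word) raises ZeroDivisionError for a word that occurs in no document's contextcount.csv; the caller in Example 8 only passes words taken from such a file, so the case cannot arise there, but add-one smoothing is a common safeguard (a sketch, not part of the original code):

import math

def smoothed_idf(n_docs, n_docs_with_word):
    # the +1 keeps the denominator positive even for unseen words
    return math.log(n_docs / (1 + n_docs_with_word))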
Example 5
import os

def main():
    script_location = os.path.dirname(__file__)

    for link, metainfo in target_texts.items():
        name = metainfo[0] + " - " + metainfo[1]
        # os.path.join joins pathnames "smartly", because Windows uses \ rather than /
        folder = os.path.join(script_location, "corpus", name)
        print("Cleaning text for: {}".format(name))
        inputfile = os.path.join(folder, "text.txt")
        outputfile = os.path.join(folder, "cleaned.txt")
        if os.path.isfile(outputfile):
            os.remove(outputfile)
            print("Text '{}' already cleaned, removing cleaned.txt to re-clean".format(name))
        # run the cleaning step regardless of whether an old cleaned.txt had to be removed
        cleanText(inputfile, outputfile)
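cleanText is not shown in any of the examples. Since Example 1 splits cleaned.txt on single spaces, a plausible, purely hypothetical implementation would lowercase the text, strip punctuation, and normalise whitespace:

import string

def cleanText(inputfile, outputfile):
    # hypothetical reconstruction, not the project's actual cleaning logic
    with open(inputfile) as f:
        text = f.read().lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    with open(outputfile, 'w') as f:
        f.write(" ".join(text.split()))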
Example 6
import os

def main():
    # take the directory name of the current script and store it in script_location
    script_location = os.path.dirname(__file__)
    targetlist = ['pan', 'mountain', 'agora']
    contextlength = 50  # IDEA: make contextlength depend on text length (as a percentage)
    for link, metainfo in target_texts.items():  # yields key, value pairs
        name = metainfo[0] + " - " + metainfo[1]  # author and title
        # os.path.join joins pathnames "smartly", because Windows uses \ rather than /
        folder = os.path.join(script_location, "corpus", name)
        print("Generating context data for: {}".format(name))
        inputfile = os.path.join(folder, "cleaned.txt")
        outputfile = os.path.join(folder, "context.csv")
        if os.path.isfile(outputfile):
            os.remove(outputfile)
            print("Text '{}' already contextualised, removing context.csv to re-analyse".format(name))
        contextualiseText(inputfile, outputfile, targetlist, contextlength)
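contextualiseText is also not shown. Judging from its arguments and from the 'context' column that Example 7 reads out of context.csv, it plausibly collects a window of words around each target word; a hypothetical sketch (the 'target' column name and the words-based window are assumptions):

import csv

def contextualiseText(inputfile, outputfile, targetlist, contextlength):
    # hypothetical reconstruction: write a window of `contextlength` words
    # on either side of every occurrence of a target word
    with open(inputfile) as f:
        words = f.read().split(" ")
    with open(outputfile, 'w', newline='') as out:
        writer = csv.writer(out)
        writer.writerow(["target", "context"])
        for i, word in enumerate(words):
            if word in targetlist:
                window = words[max(0, i - contextlength):i + contextlength + 1]
                writer.writerow([word, " ".join(window)])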
Example 7
import csv
import os

def main():
    script_location = os.path.dirname(__file__)
    for link, metainfo in target_texts.items():
        contextcount = {}
        name = metainfo[0] + " - " + metainfo[1]
        # os.path.join joins pathnames "smartly", because Windows uses \ rather than /
        folder = os.path.join(script_location, "corpus", name)  # append corpus/textname to the current path
        context = os.path.join(folder, "context.csv")  # then append context.csv
        # count word frequencies inside the extracted contexts
        with open(context) as csvfile:  # open takes a string (the filename) as argument
            contextreader = csv.DictReader(csvfile)
            for line in contextreader:
                contexttext = line['context']
                for word in contexttext.strip().split(" "):
                    if word in contextcount:
                        contextcount[word] += 1
                    else:
                        contextcount[word] = 1
        sorted_contextcount = sorted(contextcount,
                                     key=lambda woord: contextcount[woord],
                                     reverse=True)
        nwords = 0
        resultfile_path = os.path.join(folder, "contextcount.csv")
        if os.path.exists(resultfile_path):
            os.remove(resultfile_path)
        with open(resultfile_path, 'w', newline='') as resultfile:
            resultwriter = csv.writer(resultfile, delimiter=",")
            resultwriter.writerow(["woord", "frequency"])
            for woord in sorted_contextcount:
                resultwriter.writerow([woord, contextcount[woord]])
                nwords += contextcount[woord]
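csv.DictReader, used above, maps each row to a dict keyed by the file's header row, which is why line['context'] works; a minimal in-memory illustration (the 'target' header is an assumption, only 'context' is known from the code above):

import csv
import io

sample = io.StringIO("target,context\npan,the great god pan\n")
for row in csv.DictReader(sample):
    print(row['context'])  # prints: the great god pan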
Example 8
import csv
import os

def main():
    script_location = os.path.dirname(__file__)
    for link, metainfo in target_texts.items():
        frequencies = {}
        name = metainfo[0] + " - " + metainfo[1]
        # os.path.join joins pathnames "smartly", because Windows uses \ rather than /
        folder = os.path.join(script_location, "corpus", name)  # append corpus/textname to the current path
        context = os.path.join(folder, "contextcount.csv")  # then append contextcount.csv

        total_wordcount = 0  # total number of words in the contexts
        with open(context) as csvfile:  # open takes a string (the filename) as argument
            contextreader = csv.reader(csvfile)
            next(contextreader)  # skip the header row
            for key, value in contextreader:
                total_wordcount += int(value)

        # compute tf-idf
        with open(context) as csvfile:
            contextreader = csv.reader(csvfile)
            next(contextreader)  # skip the header row
            for key, value in contextreader:  # each line is a [key, value] pair
                frequencies[key] = (int(value) / total_wordcount) * inverseDocumentFrequency(key, target_texts)

        sorted_frequencies = sorted(
            frequencies, key=lambda word: frequencies[word], reverse=True)
        # for k in sorted_frequencies:
        #     print(f"{k} -> {frequencies[k]}")
        # print(sorted_frequencies)

        resultfile_path = os.path.join(folder, "tfidf.csv")
        if os.path.exists(resultfile_path):
            os.remove(resultfile_path)
        with open(resultfile_path, 'w', newline='') as resultfile:
            resultwriter = csv.writer(resultfile, delimiter=",")
            resultwriter.writerow(["woord", "tfidf"])
            for woord in sorted_frequencies:
                resultwriter.writerow([woord, frequencies[woord]])
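Combined with Example 4, the value written to tfidf.csv is the classic tf-idf score: tf(w, d) = count(w, d) / total(d), multiplied by idf(w) = ln(N_docs / N_docs_with_word). A tiny worked example with assumed toy numbers:

import math

count_in_doc = 5       # assumed: occurrences of the word in this document's contexts
total_in_doc = 200     # assumed: total context words in this document
n_docs = 10            # assumed: number of documents in target_texts
n_docs_with_word = 2   # assumed: documents whose contextcount.csv contains the word

tf = count_in_doc / total_in_doc           # 0.025
idf = math.log(n_docs / n_docs_with_word)  # ln(5) ≈ 1.609
print(tf * idf)                            # ≈ 0.0402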
Example 9
import csv
import os

import plotly.graph_objs as go
import plotly.plotly as py  # in plotly >= 4 this module moved to chart_studio.plotly

def main():
    script_location = os.path.dirname(__file__)
    data = []
    for link, metainfo in target_texts.items():
        name = metainfo[0] + " - " + metainfo[1]
        # os.path.join joins pathnames "smartly", because Windows uses \ rather than /
        folder = os.path.join(script_location, "corpus", name)  # append corpus/textname to the current path
        tfidf_path = os.path.join(folder, "tfidf.csv")
        words = []
        tfidfs = []
        with open(tfidf_path) as csvfile:
            reader = csv.reader(csvfile)
            next(reader)  # skip the header row
            for woord, tfidf in reader:
                words.append(woord)
                tfidfs.append(float(tfidf))  # convert so plotly treats the axis numerically

        # keep the top 90% of words (the list is sorted by tf-idf, descending)
        cutoff_percentage = 90
        cutoff_index = int((len(words) / 100) * cutoff_percentage)
        words = words[:cutoff_index]
        tfidfs = tfidfs[:cutoff_index]

        # reverse so the highest tf-idf ends up at the top of the horizontal bar chart
        words.reverse()
        tfidfs.reverse()
        # https://plot.ly/python/reference/#bar
        barvis = go.Bar(name=name, x=tfidfs, y=words, orientation='h')

        # Per document
        # data = [barvis]
        # py.plot(data, filename=name, auto_open=True)
        # data = []
        # all together
        data.append(barvis)
    py.plot(data, filename="TF-IDF for context", auto_open=True)
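py.plot uploads the figure to the Chart Studio cloud and needs an account and API key; if that is not available, plotly's offline API renders the same `data` list to a local HTML file (a sketch; the output filename is arbitrary):

import plotly.offline

# writes an interactive HTML file next to the script instead of uploading
plotly.offline.plot({"data": data}, filename="tfidf_context.html", auto_open=True)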