import csv
import os

from targets import target_texts  # project module defining the scraping targets; import path assumed


def main():
    script_location = os.path.dirname(__file__)
    for link, metainfo in target_texts.items():
        wordcount = {}
        name = metainfo[0] + " - " + metainfo[1]
        # os.path.join joins path names 'smartly', because Windows uses \ rather than /
        folder = os.path.join(script_location, "corpus", name)
        cleaned_file = os.path.join(folder, "cleaned.txt")
        # Count how often each word occurs in the cleaned text
        with open(cleaned_file) as cleaned_text:
            for line in cleaned_text:
                stripped = line.strip()
                splitted = stripped.split(" ")
                for word in splitted:
                    if word in wordcount:
                        wordcount[word] += 1
                    else:
                        wordcount[word] = 1
        print(name)
        sorted_wordcount = sorted(wordcount,
                                  key=lambda woord: wordcount[woord],
                                  reverse=True)
        resultfile_path = os.path.join(folder, "wordcount.csv")
        if os.path.exists(resultfile_path):
            os.remove(resultfile_path)
        # newline='' prevents csv.writer from emitting blank rows on Windows
        with open(resultfile_path, 'w', newline='') as resultfile:
            resultwriter = csv.writer(resultfile, delimiter=",")
            resultwriter.writerow(["woord", "frequency"])
            for woord in sorted_wordcount:
                resultwriter.writerow([woord, wordcount[woord]])
import csv
import os

from wordcloud import WordCloud

from targets import target_texts  # project module defining the scraping targets; import path assumed


def main():
    script_location = os.path.dirname(__file__)
    for link, metainfo in target_texts.items():
        name = metainfo[0] + " - " + metainfo[1]
        # os.path.join joins path names 'smartly', because Windows uses \ rather than /
        # append corpus/<text name> to the current path
        folder = os.path.join(script_location, "corpus", name)
        tfidf_path = os.path.join(folder, "tfidf.csv")
        imagepath = os.path.join(folder, "wordcloud.png")
        frequencies = {}
        with open(tfidf_path) as csvfile:
            reader = csv.reader(csvfile)
            next(reader)  # skip the header row
            for word, tfidf in reader:
                frequencies[word] = float(tfidf)
        if len(frequencies) == 0:
            continue
        # Generate a word cloud image from the tf-idf scores
        wordcloud = WordCloud(width=800,
                              height=400,
                              mode='RGBA',
                              background_color='white',
                              colormap='inferno').fit_words(frequencies)
        # The PIL way (if you don't have matplotlib)
        image = wordcloud.to_image()
        if os.path.exists(imagepath):
            os.remove(imagepath)
        image.save(imagepath, 'PNG')
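# A minimal standalone sketch of the WordCloud call used above: fit_words()
# takes a mapping of word -> weight. The words echo the pipeline's targetlist,
# but the weights here are made-up illustration values, not real tf-idf scores.
from wordcloud import WordCloud

example_frequencies = {"pan": 0.9, "mountain": 0.5, "agora": 0.3}
cloud = WordCloud(width=400, height=200,
                  background_color='white').fit_words(example_frequencies)
cloud.to_image().save("example_cloud.png", "PNG")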
import os

from scraper import scrapeText  # project module; import path assumed
from targets import target_texts  # project module defining the scraping targets; import path assumed


def main():
    script_location = os.path.dirname(__file__)
    print("starting scraping!")
    # Modify to set scraping targets as a dict in the following structure:
    #   tuftslink: [author, title, year, genre, filterfunction]
    # filterfunction is a function str -> bool that can apply additional filters
    for link, metainfo in target_texts.items():
        print("Scraping target {}".format(metainfo[0] + " - " + metainfo[1]))
        # os.path.join joins path names 'smartly', because Windows uses \ rather than /
        folder = os.path.join(script_location, "corpus",
                              metainfo[0] + " - " + metainfo[1])
        if os.path.exists(folder):  # if the folder already exists, do nothing!
            print("[WARNING] found a folder {} so skipping this site.".format(folder))
            continue  # carry on with the for loop without processing this item further
        print("scraping: {}".format(link))
        if metainfo[4]:
            text = scrapeText(link, metainfo[4])
        else:
            text = scrapeText(link)
        os.makedirs(folder, exist_ok=True)
        text_file_path = os.path.join(folder, "text.txt")
        with open(text_file_path, "w") as text_file:
            text_file.write(text)
        meta_file_path = os.path.join(folder, "meta.txt")
        with open(meta_file_path, "w") as meta_file:
            meta_file.write("\n".join(metainfo[:-1]))  # remove the filter here
        print("finished scraping: {}".format(link))
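# An illustrative target_texts entry following the structure documented above;
# the URL and all metadata values are placeholders, not a real scraping target.
target_texts = {
    "http://www.perseus.tufts.edu/hopper/text?doc=PLACEHOLDER": [
        "Example Author",   # author
        "Example Title",    # title
        "1900",             # year
        "example-genre",    # genre
        None,               # filterfunction: str -> bool, or None for no extra filter
    ],
}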
import csv
import math
import os


def inverseDocumentFrequency(word, target_texts):
    """Return log(N_docs / N_docs_with_word) over the corpus's context counts."""
    N_docs = len(target_texts)
    N_docs_with_word = 0
    script_location = os.path.dirname(__file__)
    for link, metainfo in target_texts.items():
        name = metainfo[0] + " - " + metainfo[1]
        # os.path.join joins path names 'smartly', because Windows uses \ rather than /
        # append corpus/<text name> to the current path
        folder = os.path.join(script_location, "corpus", name)
        # and append contextcount.csv to that folder
        context = os.path.join(folder, "contextcount.csv")
        with open(context) as csvfile:  # open takes a filename string as argument
            contextreader = csv.reader(csvfile)
            next(contextreader)  # skip the header row
            for key, frequency in contextreader:
                if word == key:
                    N_docs_with_word += 1
                    break  # one hit per document is enough
    return math.log(N_docs / N_docs_with_word)
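# A worked example of the formula above: with 4 documents in the corpus and a
# word appearing in the contexts of 2 of them, idf = log(4/2) = log 2 ≈ 0.693.
# Note the implicit assumption that the word occurs in at least one document;
# with N_docs_with_word == 0 the division raises ZeroDivisionError. The tf-idf
# script only queries words taken from a contextcount.csv, so the count is
# always at least 1 there.
import math
print(math.log(4 / 2))  # ≈ 0.693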
import os

from cleaner import cleanText  # project module; import path assumed
from targets import target_texts  # project module defining the scraping targets; import path assumed


def main():
    script_location = os.path.dirname(__file__)
    for link, metainfo in target_texts.items():
        name = metainfo[0] + " - " + metainfo[1]
        # os.path.join joins path names 'smartly', because Windows uses \ rather than /
        folder = os.path.join(script_location, "corpus", name)
        print("Cleaning text for: {}".format(name))
        inputfile = os.path.join(folder, "text.txt")
        outputfile = os.path.join(folder, "cleaned.txt")
        if os.path.isfile(outputfile):
            os.remove(outputfile)
            print("Text '{}' already cleaned, removing cleaned.txt to re-clean"
                  .format(name))
        cleanText(inputfile, outputfile)
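# cleanText itself lives elsewhere in the project. A minimal sketch of what it
# is assumed to do, given that the later scripts split cleaned.txt on single
# spaces: lowercase the text, strip punctuation, and collapse whitespace.
# Illustrative only, not the project's actual implementation.
import re

def cleanText(inputfile, outputfile):
    with open(inputfile) as source:
        text = source.read().lower()
    text = re.sub(r"[^\w\s]", " ", text)      # drop punctuation
    text = re.sub(r"\s+", " ", text).strip()  # collapse runs of whitespace
    with open(outputfile, "w") as target:
        target.write(text)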
import os

from contextualiser import contextualiseText  # project module; import path assumed
from targets import target_texts  # project module defining the scraping targets; import path assumed


def main():
    # Get the directory name of the current script and store it in script_location
    script_location = os.path.dirname(__file__)
    targetlist = ['pan', 'mountain', 'agora']
    contextlength = 50  # IDEA: make contextlength depend on text length (as a percentage)
    for link, metainfo in target_texts.items():  # yields key, value pairs
        name = metainfo[0] + " - " + metainfo[1]  # author and title
        # os.path.join joins path names 'smartly', because Windows uses \ rather than /
        folder = os.path.join(script_location, "corpus", name)
        print("Generating context data for: {}".format(name))
        inputfile = os.path.join(folder, "cleaned.txt")
        outputfile = os.path.join(folder, "context.csv")
        if os.path.isfile(outputfile):
            os.remove(outputfile)
            print("Text '{}' already contextualised, removing context.csv to re-analyse"
                  .format(name))
        contextualiseText(inputfile, outputfile, targetlist, contextlength)
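# contextualiseText also lives elsewhere in the project. A minimal sketch under
# these assumptions: for every occurrence of a word from targetlist it writes
# one CSV row whose 'context' column holds the window of words around the hit
# (the context-count script below reads context.csv via csv.DictReader and the
# 'context' key). The 'target' column name is a guess. Illustrative only.
import csv

def contextualiseText(inputfile, outputfile, targetlist, contextlength):
    with open(inputfile) as source:
        words = source.read().split(" ")
    with open(outputfile, "w", newline="") as target:
        writer = csv.DictWriter(target, fieldnames=["target", "context"])
        writer.writeheader()
        for i, word in enumerate(words):
            if word in targetlist:
                window = words[max(0, i - contextlength):i + contextlength + 1]
                writer.writerow({"target": word, "context": " ".join(window)})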
import csv
import os

from targets import target_texts  # project module defining the scraping targets; import path assumed


def main():
    script_location = os.path.dirname(__file__)
    for link, metainfo in target_texts.items():
        contextcount = {}
        name = metainfo[0] + " - " + metainfo[1]
        # os.path.join joins path names 'smartly', because Windows uses \ rather than /
        # append corpus/<text name> to the current path
        folder = os.path.join(script_location, "corpus", name)
        # and append context.csv to that folder
        context = os.path.join(folder, "context.csv")
        # Count how often each word occurs in the extracted contexts
        with open(context) as csvfile:  # open takes a filename string as argument
            contextreader = csv.DictReader(csvfile)
            for line in contextreader:
                contexttext = line['context']
                stripped = contexttext.strip()
                splitted = stripped.split(" ")
                for word in splitted:
                    if word in contextcount:
                        contextcount[word] += 1
                    else:
                        contextcount[word] = 1
        sorted_contextcount = sorted(contextcount,
                                     key=lambda woord: contextcount[woord],
                                     reverse=True)
        nwords = 0
        resultfile_path = os.path.join(folder, "contextcount.csv")
        if os.path.exists(resultfile_path):
            os.remove(resultfile_path)
        # newline='' prevents csv.writer from emitting blank rows on Windows
        with open(resultfile_path, 'w', newline='') as resultfile:
            resultwriter = csv.writer(resultfile, delimiter=",")
            resultwriter.writerow(["woord", "frequency"])
            for woord in sorted_contextcount:
                resultwriter.writerow([woord, contextcount[woord]])
                nwords += contextcount[woord]
import csv
import os

from idf import inverseDocumentFrequency  # defined earlier in this pipeline; import path assumed
from targets import target_texts  # project module defining the scraping targets; import path assumed


def main():
    script_location = os.path.dirname(__file__)
    for link, metainfo in target_texts.items():
        frequencies = {}
        name = metainfo[0] + " - " + metainfo[1]
        # os.path.join joins path names 'smartly', because Windows uses \ rather than /
        # append corpus/<text name> to the current path
        folder = os.path.join(script_location, "corpus", name)
        # and append contextcount.csv to that folder
        context = os.path.join(folder, "contextcount.csv")
        total_wordcount = 0  # total number of words in the contexts
        with open(context) as csvfile:  # open takes a filename string as argument
            contextreader = csv.reader(csvfile)
            next(contextreader)  # skip the header row
            for key, value in contextreader:
                total_wordcount += int(value)
        # compute tf-idf: term frequency times inverse document frequency
        with open(context) as csvfile:
            contextreader = csv.reader(csvfile)
            next(contextreader)  # skip the header row
            for key, value in contextreader:  # each row is a [key, value] list
                frequencies[key] = ((int(value) / total_wordcount)
                                    * inverseDocumentFrequency(key, target_texts))
        sorted_frequencies = sorted(frequencies,
                                    key=lambda word: frequencies[word],
                                    reverse=True)
        resultfile_path = os.path.join(folder, "tfidf.csv")
        if os.path.exists(resultfile_path):
            os.remove(resultfile_path)
        # newline='' prevents csv.writer from emitting blank rows on Windows
        with open(resultfile_path, 'w', newline='') as resultfile:
            resultwriter = csv.writer(resultfile, delimiter=",")
            resultwriter.writerow(["woord", "tfidf"])
            for woord in sorted_frequencies:
                resultwriter.writerow([woord, frequencies[woord]])
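# A worked example of the computation above with made-up numbers: a word that
# occurs 10 times in a 500-word context file has tf = 10/500 = 0.02; if it
# appears in 2 of 4 documents, idf = log(4/2) ≈ 0.693, so tf-idf ≈ 0.0139.
import math
print((10 / 500) * math.log(4 / 2))  # ≈ 0.0139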
import csv
import os

import plotly.graph_objs as go
import plotly.offline as py  # assumed: the offline plotting API (py.plot writes an HTML file)

from targets import target_texts  # project module defining the scraping targets; import path assumed


def main():
    script_location = os.path.dirname(__file__)
    data = []
    for link, metainfo in target_texts.items():
        name = metainfo[0] + " - " + metainfo[1]
        # os.path.join joins path names 'smartly', because Windows uses \ rather than /
        # append corpus/<text name> to the current path
        folder = os.path.join(script_location, "corpus", name)
        tfidf_path = os.path.join(folder, "tfidf.csv")
        words = []
        tfidfs = []
        with open(tfidf_path) as csvfile:
            reader = csv.reader(csvfile)
            next(reader)  # skip the header row
            for woord, tfidf in reader:
                words.append(woord)
                tfidfs.append(float(tfidf))
        # keep only the top 90% of words (rows are sorted by tf-idf, descending)
        cutoff_percentage = 90
        cutoff_index = int((len(words) / 100) * cutoff_percentage)
        words = words[:cutoff_index]
        tfidfs = tfidfs[:cutoff_index]
        # reverse so the highest tf-idf ends up at the top of the horizontal bar chart
        words.reverse()
        tfidfs.reverse()
        # https://plot.ly/python/reference/#bar
        barvis = go.Bar(name=name, x=tfidfs, y=words, orientation='h')
        # Per document:
        # data = [barvis]
        # py.plot(data, filename=name, auto_open=True)
        # data = []
        # All together:
        data.append(barvis)
    # one combined plot with a bar trace per document
    py.plot(data, filename="TF-IDF for context", auto_open=True)