def main(): """ Here starts a classificator which makes categorization upon certain stems. """ print_own_info(__file__) xmlcollection = Collection() if exists_tfidf_matrix(xmlcollection, create=True) is True: process_project(get_tfidf_matrix_file(), xmlcollection)
def main():
    print_own_info(__file__)
    yaml_file = get_graph_file()

    if exists(yaml_file):
        print "Graph (YAML) file exists: ", yaml_file
        print "Reading it ..."
        g = nx.read_yaml(yaml_file)
        d3_js.export_d3_js(g)
        print "Web files exported."
    else:
        create_graph()
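# NOTE: Illustrative sketch, not part of the original script. It spells out
# the YAML round trip assumed above: create_graph() presumably builds a
# networkx graph and persists it with nx.write_yaml(), so that a later run
# finds the file, reloads it with nx.read_yaml() and exports it for the web.
# The function name, node labels and path below are made up.
def _graph_yaml_roundtrip_sketch(path="graph_sketch.yaml"):
    import networkx as nx
    g = nx.Graph()
    g.add_edge("doc_1", "doc_2", weight=0.8)   # e.g. document similarity
    g.add_edge("doc_2", "doc_3", weight=0.3)
    nx.write_yaml(g, path)                     # persist the graph ...
    g2 = nx.read_yaml(path)                    # ... and load it back unchanged
    return sorted(g2.edges())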
""" from glob import glob from collections import defaultdict from sys import argv from codecs import open from re import sub from xml.sax.handler import ContentHandler from xml.sax import make_parser from xml.dom.minidom import Document as Doc from wh4t.library import print_own_info, get_def_enc, \ get_invalid_xml_filename, get_mailfolder print_own_info(__file__) def add_invalid_docs(filename, exceptstring): """ Add invalid documents by filename and exception string to the above prepared file. @param filename: String of Filename of the invalid XML document found. @param exceptstring: Exception string that lead to an error, including the position where the error was found. """ invalid_doc = invalid_xmldoc.createElement("invalid_document") invalid_doc_filename = invalid_xmldoc.createTextNode(filename) errstring = sub(filename + ":", "", exceptstring) invalid_doc.setAttribute("error", errstring) invalid_doc.appendChild(invalid_doc_filename)
def main(): """ This program makes a first exploration of all the input material we have, it prints out information like: - How big the input folder is (bytes) - How many raw text material is available (bytes), i. e. w/o meta-data - How many symbols are used - How many tokens, words, stems etc. are available TBD: - Add params to this program or make it more user-friendly / interactive. - Add more outcome, probably not only quantitative, but also qualitative information. - Put some of the (verbose) text into other classes. """ print_own_info(__file__) # Print total file size (=folder size) information of the # input material xmldocs = Collection() no_of_docs = len(xmldocs.get_docs()) print "-- Calculating total file size ..." print "Total file size: " + str(xmldocs.get_filesize()) + " bytes" print_line() # Print total raw text material information, being body text # of messages w/o meta-data rawsize = xmldocs.get_rawsize() print "-- Calculating raw size of text ..." print "Total raw size: " + str(rawsize) + " bytes" print "Avg raw size: " + str((rawsize / no_of_docs)) + " bytes" # Write all (body) text to a text file stdout.write("Write raw text into file: " + get_raw_file()) xmldocs.write_raw_text(in_one_file=True) print_ok() # - Write all unique symbols like "a", "ö" or "\", which are used, # into a file # - Give number of unique symbols employed stdout.write("Write symbols used into file: " + get_symbols_file()) syms = Symbols() syms.write_symbols() print_ok() print_line() print "-- Get unique symbols ..." print "Total number of unique symbols: " + str(syms.get_no_of_symbols()) print_line() # Print total numbers of tokens available; separation is done # by means of the Natural Language Toolkit (NLTK) # Problematic here: There are lots of non-linguistic tokens being # created, like URLs, at this stage. # That's why these tokens here are denoted as being "raw". print "-- Get tokens ..." tokenized_text = map(lambda x:x.lower(), xmldocs.get_tokens()) print "Total number of (raw) tokens: " + str(len(tokenized_text)) print "Avg number of (raw) tokens: " + \ str(len(tokenized_text)/no_of_docs) print_line() # - Print total number of unique tokens (=types); also here, lots # of "non-linguistic" types are preserved, ATM. # - Print also these raw types in lower case. print "-- Get types ..." typed_text = xmldocs.get_types() typed_text_lowered = xmldocs.get_types(lower=True) print "Total number of (raw) types: " + \ str(len(typed_text)) print "Total number of (raw) types (lower-cased): " + \ str(len(typed_text_lowered)) print "Avg number of (raw) types: " + \ str(len(typed_text)/no_of_docs) print "Avg number of (raw) types (lower-cased): " + \ str(len(typed_text_lowered)/no_of_docs) print_line() # - Print total number of words. These are "real" words; they # are very likely to be of linguistic nature, because they # were cleaned by the means of regexps -- constructed upon # observations made. print "-- Get number of words ..." words = xmldocs.get_words() words2 = set(words) print "Total number of words: " + \ str(len(words)) print "Total number of words2: " + \ str(len(words2)) print "Avg number of words: " + \ str(len(words)/no_of_docs) print_line() # - Get the subset of nouns from the words print "-- Get number of nouns ..." 
    nouns = xmldocs.get_words(pos='n')
    print "Total number of nouns: " + \
          str(len(nouns))
    print "Avg number of nouns: " + \
          str(len(nouns) / no_of_docs)
    print "Total number of (unique) nouns: " + \
          str(len(set(nouns)))
    print "Avg number of (unique) nouns: " + \
          str(len(set(nouns)) / no_of_docs)
    print_line()

    # - Print the total number of stems, created by NLTK means,
    #   applied over the words.
    print "-- Get number of stems ..."
    stemmed_text = xmldocs.get_stems()
    print "Total number of stems: " + \
          str(len(stemmed_text))
    print "Avg number of stems: " + \
          str(len(stemmed_text) / no_of_docs)
    print_line()

    print "-- Get number of unique stems ..."
    stemmed_uniq_text = xmldocs.get_stems(uniq=True)
    print "Total number of unique stems: " + \
          str(len(stemmed_uniq_text))
    print "Avg number of (unique) stems: " + \
          str(len(stemmed_uniq_text) / no_of_docs)
    print_line()

    # Finally write some files, containing tokens, types, types in
    # lower case, words, stems and nouns.
    stdout.write("Write tokens into file: " + get_tokens_file())
    xmldocs.write_tokens()
    print_ok()
    stdout.write("Write types into file: " + get_types_file())
    xmldocs.write_types()
    print_ok()
    stdout.write("Write types (lowered) into file: " +
                 get_types_file(lower=True))
    xmldocs.write_types(lower=True)
    print_ok()
    stdout.write("Write words into file: " + get_words_file())
    xmldocs.write_words()
    print_ok()
    stdout.write("Write stems (unique) into file: " + get_stems_file())
    xmldocs.write_stems()
    print_ok()
    stdout.write("Write nouns into file: " + get_words_file(pos='n'))
    xmldocs.write_words(pos='n')
    print_ok()
    print_line()

    # Print the 42 most frequent words -- Zipf's law holds true ;-)
    print "Top 42 words (most frequent): "
    for stem in xmldocs.get_freqdist().keys()[:42]:
        print stem
    print_line()

    # Print the 42 most relevant words -- ranked by the tf*idf measure
    print "Top 42 words (most relevant): "
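    # NOTE: Illustrative sketch, not part of the original script. It shows
    # what the frequency ranking above boils down to: count every word and
    # take the top of the distribution, where Zipf's law predicts a few
    # words covering most of the text. The helper name and the toy word
    # list are made up; the real counts come from xmldocs.get_freqdist().
    def _top_words_sketch(word_list, n=42):
        from collections import Counter
        counts = Counter(word_list)      # word -> absolute frequency
        return counts.most_common(n)     # [(word, count), ...], most frequent first

    # Example (toy data):
    # >>> _top_words_sketch(["mail", "list", "mail", "spam", "mail", "list"], n=2)
    # [('mail', 3), ('list', 2)]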