def main():
    """
    This program makes a first exploration of all the input material we
    have; it prints out information like:
    - How big the input folder is (bytes)
    - How much raw text material is available (bytes), i.e. w/o meta-data
    - How many symbols are used
    - How many tokens, words, stems etc. are available

    TBD:
    - Add params to this program or make it more user-friendly / interactive.
    - Add more output, probably not only quantitative, but also qualitative
      information.
    - Put some of the (verbose) text into other classes.
    """
    print_own_info(__file__)

    # Print total file size (=folder size) information of the
    # input material
    xmldocs = Collection()
    no_of_docs = len(xmldocs.get_docs())
    print "-- Calculating total file size ..."
    print "Total file size: " + str(xmldocs.get_filesize()) + " bytes"
    print_line()

    # Print total raw text material information, being body text
    # of messages w/o meta-data
    rawsize = xmldocs.get_rawsize()
    print "-- Calculating raw size of text ..."
    print "Total raw size: " + str(rawsize) + " bytes"
    print "Avg raw size: " + str(rawsize / no_of_docs) + " bytes"

    # Write all (body) text to a text file
    stdout.write("Write raw text into file: " + get_raw_file())
    xmldocs.write_raw_text(in_one_file=True)
    print_ok()

    # - Write all unique symbols used, like "a", "ö" or "\", into a file
    # - Give the number of unique symbols employed
    stdout.write("Write symbols used into file: " + get_symbols_file())
    syms = Symbols()
    syms.write_symbols()
    print_ok()
    print_line()
    print "-- Get unique symbols ..."
    print "Total number of unique symbols: " + str(syms.get_no_of_symbols())
    print_line()

    # Print the total number of tokens available; separation is done
    # by means of the Natural Language Toolkit (NLTK).
    # Problematic here: lots of non-linguistic tokens, like URLs, get
    # created at this stage. That's why these tokens are denoted as "raw".
    print "-- Get tokens ..."
    tokenized_text = map(lambda x: x.lower(), xmldocs.get_tokens())
    print "Total number of (raw) tokens: " + str(len(tokenized_text))
    print "Avg number of (raw) tokens: " + \
        str(len(tokenized_text) / no_of_docs)
    print_line()

    # - Print total number of unique tokens (=types); also here, lots
    #   of "non-linguistic" types are preserved, ATM.
    # - Print these raw types in lower case as well.
    print "-- Get types ..."
    typed_text = xmldocs.get_types()
    typed_text_lowered = xmldocs.get_types(lower=True)
    print "Total number of (raw) types: " + \
        str(len(typed_text))
    print "Total number of (raw) types (lower-cased): " + \
        str(len(typed_text_lowered))
    print "Avg number of (raw) types: " + \
        str(len(typed_text) / no_of_docs)
    print "Avg number of (raw) types (lower-cased): " + \
        str(len(typed_text_lowered) / no_of_docs)
    print_line()

    # Print the total number of words. These are "real" words; they are
    # very likely to be of linguistic nature, because they were cleaned
    # by means of regexps -- constructed upon observations made.
    print "-- Get number of words ..."
    words = xmldocs.get_words()
    words2 = set(words)
    print "Total number of words: " + \
        str(len(words))
    print "Total number of (unique) words: " + \
        str(len(words2))
    print "Avg number of words: " + \
        str(len(words) / no_of_docs)
    print_line()

    # Get the subset of nouns from the words
    print "-- Get number of nouns ..."
    nouns = xmldocs.get_words(pos='n')
    print "Total number of nouns: " + \
        str(len(nouns))
    print "Avg number of nouns: " + \
        str(len(nouns) / no_of_docs)
    print "Total number of (unique) nouns: " + \
        str(len(set(nouns)))
    print "Avg number of (unique) nouns: " + \
        str(len(set(nouns)) / no_of_docs)
    print_line()

    # Print the total number of stems, created by NLTK means,
    # applied over the words.
    print "-- Get number of stems ..."
    stemmed_text = xmldocs.get_stems()
    print "Total number of stems: " + \
        str(len(stemmed_text))
    print "Avg number of stems: " + \
        str(len(stemmed_text) / no_of_docs)
    print_line()

    print "-- Get number of unique stems ..."
    stemmed_uniq_text = xmldocs.get_stems(uniq=True)
    print "Total number of unique stems: " + \
        str(len(stemmed_uniq_text))
    print "Avg number of (unique) stems: " + \
        str(len(stemmed_uniq_text) / no_of_docs)
    print_line()

    # Finally write some files, containing tokens, types, types in
    # lower case, words, stems and nouns.
    stdout.write("Write tokens into file: " + get_tokens_file())
    xmldocs.write_tokens()
    print_ok()

    stdout.write("Write types into file: " + get_types_file())
    xmldocs.write_types()
    print_ok()

    stdout.write("Write types (lowered) into file: " +
                 get_types_file(lower=True))
    xmldocs.write_types(lower=True)
    print_ok()

    stdout.write("Write words into file: " + get_words_file())
    xmldocs.write_words()
    print_ok()

    stdout.write("Write stems (unique) into file: " + get_stems_file())
    xmldocs.write_stems()
    print_ok()

    stdout.write("Write nouns into file: " + get_words_file(pos='n'))
    xmldocs.write_words(pos='n')
    print_ok()
    print_line()

    # Print the 42 most frequent words -- Zipf's law turns true ;-)
    print "Top 42 words (most frequent): "
    for stem in xmldocs.get_freqdist().keys()[:42]:
        print stem
    print_line()

    # Print the 42 most relevant words -- according to the tf*idf measure
    print "Top 42 words (most relevant): "
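    # NOTE: The original listing breaks off at this point. The block below is
    # a minimal, hedged sketch of how such a tf*idf ranking could be obtained
    # with nltk.text.TextCollection -- an illustration, not necessarily the
    # project's own implementation. It assumes each document returned by
    # get_docs() exposes a (hypothetical) get_tokens() method.
    from nltk.text import TextCollection
    doc_tokens = [[t.lower() for t in doc.get_tokens()]
                  for doc in xmldocs.get_docs()]
    textcollection = TextCollection(doc_tokens)
    # Score each (lower-cased) token by tf over the whole collection times
    # idf over the documents, then print the 42 best-scoring words.
    scores = {}
    for word in set(tokenized_text):
        scores[word] = textcollection.tf_idf(word, tokenized_text)
    for word in sorted(scores, key=scores.get, reverse=True)[:42]:
        print word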
""" @author Hernani Marques <*****@*****.**>, 2012 """ from sys import stdout from wh4t.library import print_own_info from wh4t.documents import Collection from wh4t.symbols import Symbols from nltk.text import TextCollection print_own_info(__file__) # Create a symbols object to see which symbols are available, along with # their attributions symbols = Symbols() for sym, sym_class in symbols.items(): print sym + "\t" + str(sym_class) # Params for getting (similar) words by edit distance and number of # most frequent words (=top words) we are interested in. editdistance = 1 no_of_topwords = 420 xmlcollection = Collection() # For the above specified most frequent words search similar words (by # edit distance) in the whole collection. print "Finding forms for the top " + str(no_of_topwords) + \ " words by edit distance " + \ str(editdistance) + "; this may take a while!" xmlcollection.get_words_by_editdistance(editdistance=editdistance,