def GatherTexts(user_source_dir, corpus_source_dir, remove_old_plaintext = False):
    """Convert every .txt file in the user's source directory into numbered
    plaintext chunk files inside TWiC's corpus source directory.

    NOTE(review): remove_old_plaintext is accepted but never used here —
    confirm whether stale plaintext output should be purged first.
    """
    # Make sure the passed-in paths are formatted correctly for concatenation
    user_source_dir = Utils_MalletInterpret.FormatPath(user_source_dir)
    corpus_source_dir = Utils_MalletInterpret.FormatPath(corpus_source_dir)

    # Each source text may expand into several chunk files; the conversion
    # returns how many files it wrote so the running counter stays unique
    # across the whole corpus.
    next_file_number = 1
    for source_path in glob.glob(user_source_dir + "*.txt"):
        source_text = TWiC_Text(source_path)
        next_file_number += source_text.ConvertToPlainText_Chunks(corpus_source_dir, next_file_number)
def Build_TextObjects(TextClass, mallet_script, tp_collection):
    """Instantiate one TextClass object per topic-proportion record.

    Each object is constructed from the .tei file in mallet_script.tei_source
    whose name matches the record's source filename.
    """
    tei_dir = mallet_script.tei_source
    return [
        TextClass('{0}{1}.tei'.format(tei_dir, Utils_MalletInterpret.GetFilename(tp.filename)))
        for tp in tp_collection
    ]
def ConvertToPlainText_Chunks(self, p_output_dir, p_file_number, p_chunk=True, p_chunk_size=5000):
    """Write this text out as one or more ASCII plaintext files.

    p_output_dir  -- destination directory (expected to end in a separator)
    p_file_number -- first file number to use; output files are named
                     "<number>_<name>_<chunk index><ext>"
    p_chunk       -- if True, split the prepared lines into groups of
                     p_chunk_size lines; otherwise write a single file
    Returns the number of files written (caller advances its counter by this).
    """
    base_name = self.GetFilename()
    extension = self.GetFileExtension()
    prepared_lines = self.GetPreparedLines()

    # Optionally split the lines into fixed-size groups
    if p_chunk:
        line_groups = Utils_MalletInterpret.GetChunkedLines(prepared_lines, p_chunk_size)
    else:
        line_groups = [prepared_lines]

    # One output file per group; unidecode forces ASCII-safe output
    current_number = p_file_number
    for group_index, line_group in enumerate(line_groups):
        output_path = "{0}{1}_{2}_{3}{4}".format(p_output_dir, current_number, base_name, group_index, extension)
        with open(output_path, 'w') as plaintext_file:
            for line in line_group:
                plaintext_file.write(unidecode(line) + u"\n")
        current_number += 1

    return len(line_groups)
def InterpretMalletOutput(mallet_script):
    """Top-level pipeline: read the output files of a finished MALLET run
    (via mallet_script) and emit the JSON/color data driving the TWiC
    visualization under ../../../data/input/.

    mallet_script -- project object wrapping the MALLET run; supplies parsed
    topics/keys/state/word-weight file data plus corpus name and title.
    """
    print "Interpreting MALLET output for TWiC visualization..."
    # All generated files land under the client-side data/input directory
    myoutput_dir = os.path.join("..", "..", "..", "data", "input" + os.sep)
    print "\tReading in MALLET output..."
    ####### 1. Reading corpus.topics.tsv
    print "\t\tLoading topics for texts..."
    # tp_collection = mallet_script.GetTopicsFileData("2.0.7")
    # Per-text topic proportions, parsed for the MALLET 2.0.9 file format
    tp_collection = mallet_script.GetTopicsFileData("2.0.9")
    ###### 2. Reading corpus.keys.tsv
    print "\t\tLoading topic keys..."
    # Topic keys: per-topic top words and corpus-level topic proportions
    topic_keys = mallet_script.GetKeysFileData()
    ###### 3. Reading corpus.topic-state.tsv
    print "\t\tLoading topic words state file..."
    # Per-file word -> topic assignments from the Gibbs state file
    fwt_collection = mallet_script.GetStateFileData()
    ###### 4. Reading corpus.wordweights.tsv
    print "\t\tLoading topic word weights..."
    ww_table = mallet_script.GetTopicWordWeights()
    ###### 5. Build a text object for each text
    print "\tBuilding text objects..."
    # start_time = int(round(time.time() * 1000))
    # textobj_collection = TWiC_MalletInterpret.Build_TextObjects(TWiC_Poem, mallet_script, tp_collection)
    # end_time = int(round(time.time() * 1000))
    # print "Unoptimized: {0}ms".format(end_time - start_time)
    # start_time = int(round(time.time() * 1000))
    # text_obj_collection_opt = TWiC_MalletInterpret.Build_TextObjects_Opt(TWiC_Poem, mallet_script, tp_collection)
    textobj_collection = TWiC_MalletInterpret.Build_TextObjects_Opt(TWiC_Text, mallet_script, tp_collection)
    # end_time = int(round(time.time() * 1000))
    # print "Optimized: {0}ms".format(end_time - start_time)
    ###### 6. Generate a list of unique colors for each topic
    print "\tCreating color list..."
    color_list = Utils_Color.Get_UniqueColorList(len(topic_keys.corpus_topic_proportions.keys()))
    # color_list = Utils_Color.HCL_Fluo
    ###### 7. Build HTML and JSON files for each text for low and mid level TWiC representations
    print "\tCreating JSON files for TWiC views of individual texts..."
    for text in textobj_collection:
        # Match each text object to its topic-proportion record by comparing
        # the portion of its filename before the first "_" (presumably the
        # numeric prefix added during corpus gathering -- TODO confirm).
        current_tp = None
        for tp in tp_collection:
            if text.GetFilename().split("_")[0] == Utils_MalletInterpret.GetFilenameWithUnderscore(tp.filename):
                current_tp = tp
                break
        # TWiC_MalletInterpret.Build_HTMLandJSONForText(text, myoutput_dir, "{0}.css".format(mallet_script.corpus_name), \
        #     current_tp, fwt_collection, topic_keys, color_list, mallet_script, True)
        TWiC_MalletInterpret.Build_JSONForTextwithForeignObject(text, myoutput_dir, "{0}.css".format(mallet_script.corpus_name), \
            current_tp, fwt_collection, topic_keys, color_list, mallet_script, True)
    ###### 8. Build JSON files for visualization
    print "\tBuilding corpus-level JSON map files..."
    # Build a json that shows the hierarchy of Corpus -> Text clusters -> Texts based on Jensen-Shannon Distance
    TWiC_MalletInterpret.Build_CorpusMapJSON_Avg(mallet_script.corpus_title, topic_keys.corpus_topic_proportions, tp_collection, myoutput_dir + "json" + os.sep)
    # Output a JSON of the topic-color list
    # TWiC_MalletInterpret.Build_TopicColorMapJSON(color_list, myoutput_dir + "json/")
    # Generate topic list JSON based on the used_topics_list
    # TWiC_MalletInterpret.Build_TopicWordsJSON(topic_keys, myoutput_dir + "json/")
    # New JSON format for client side
    TWiC_MalletInterpret.Build_CorpusInfoJSON(mallet_script.corpus_title, textobj_collection, tp_collection, topic_keys, color_list, myoutput_dir + "json" + os.sep)
    # Build a json that lists the distribution weights of words likely to appear in each topic
    TWiC_MalletInterpret.Build_WordWeightJSON(ww_table, myoutput_dir + "json" + os.sep)
    print "Finished processing {0} for TWiC.".format(mallet_script.corpus_title)
def Build_CorpusInfoJSON(corpus_title, text_collection, tp_collection, topic_keys, color_list, output_dir):
    """Write twic_corpusinfo.json: topic words and colors, corpus-level topic
    proportions, and a per-file info table keyed by string file ID.

    tp_collection entries must expose .fileid, .filename and .topic_guide
    (topic index string -> proportion); text_collection objects must expose
    GetFilename() and GetTitle().
    """
    # Output JSON format
    # {
    #     "topic_info" : [    # indexed by int topic ID number
    #         [
    #             ["habit", "dash", "torn",...], # topic words
    #             022440"                        # hex color
    #         ],...
    #     ],
    #     "corpus_info" : [
    #         "Corpus Title",
    #         [0.5, 0.2, ...]    # topic proportions [topic0,...topicN]
    #     ],
    #     "file_info" : {
    #         "0" : [    # indexed by str file ID number
    #             "Filename",
    #             "Text Title",
    #             [0.5, 0.2, ...],    # topic proportions [topic0,...topicN]
    #             3,     # stanza count
    #             65,    # line count
    #             400    # word count
    #         ],...
    #     }
    # }
    # Indexers Info
    # "topic_info" (indexed by int topic ID)
    topic_info = "topic_info"
    TI_TopicWords = 0
    TI_Color = 1
    # "corpus_info"
    corpus_info = "corpus_info"
    CI_CorpusTitle = 0
    CI_TopicProportions = 1
    # "file_info" (indexed by str numeric file ID)
    file_info = "file_info"
    FI_Filename = 0
    FI_TextTitle = 1
    FI_TopicProportions = 2
    FI_StanzaCount = 3
    FI_LineCount = 4
    FI_WordCount = 5
    FI_FieldCount = 6
    json_output = { topic_info : [ ], corpus_info : ["", []], file_info : { } }
    topic_count = len(topic_keys.corpus_topic_proportions.keys())
    # Fill out topic_info and corpus_info
    json_output[corpus_info][CI_CorpusTitle] = corpus_title
    json_output[corpus_info][CI_TopicProportions] = [topic_keys.corpus_topic_proportions[str(topic_index)] for topic_index in range(topic_count)]
    # NOTE(review): the format comment above describes topic_info as a list of
    # [words, color] pairs indexed by topic ID, but this actually builds two
    # parallel lists: [all_topic_word_lists, all_colors]. Confirm which shape
    # the client-side code expects before changing either.
    json_output[topic_info] = [[topic_keys.corpus_topic_words[str(topic_index)] for topic_index in range(topic_count)], [color_list[topic_index] for topic_index in range(topic_count)]]
    # print "COLOR LIST COMPLETION"
    # print [color_list[topic_index] for topic_index in range(topic_count)]
    # Fill out file_info
    for tp in tp_collection:
        json_output[file_info][tp.fileid] = []
        # Pre-size the record with zeros so fields can be assigned by index
        json_output[file_info][tp.fileid] = [0 for index in range(FI_FieldCount)]
        # json_output[file_info][tp.fileid][FI_Filename] = tp.filename
        json_output[file_info][tp.fileid][FI_Filename] = Utils_MalletInterpret.GetFilename(tp.filename)
        json_output[file_info][tp.fileid][FI_TopicProportions] = []
        for topic_index in range(0, topic_count):
            json_output[file_info][tp.fileid][FI_TopicProportions].append(tp.topic_guide[str(topic_index)])
    # Attach titles by matching text objects to file records on filename
    for text in text_collection:
        text_filename = text.GetFilename()
        for fileid in json_output[file_info].keys():
            if text_filename == json_output[file_info][fileid][FI_Filename]:
                json_output[file_info][fileid][FI_TextTitle] = text.GetTitle()
    # Still have to fill out
    # FI_StanzaCount = 3
    # FI_LineCount = 4
    # FI_WordCount = 5
    # Output JSON
    with open(output_dir + "twic_corpusinfo.json", "w") as output_file:
        output_file.write(json.dumps(json_output))
def Build_HTMLandJSONForText(text, output_dir, css_filename, current_tp, fwt_collection, topic_keys, color_list, mallet_script, split_filename=False):
    """Emit a standalone HTML page for one text with topic-assigned words
    colored, plus this text's JSON (via ConvertTextToJSON).

    text           -- text object (GetFilename/GetTitle/GetPublication)
    current_tp     -- this text's topic-proportion record; its .filename is
                      the plaintext file read back in below
    fwt_collection -- per-file word/topic assignments from the state file
    split_filename -- if True, match on the portion before the first "_"
    """
    file_id = text.GetFilename()
    if split_filename:
        file_id = text.GetFilename().split("_")[0]
    output_html = open(output_dir + "html" + os.sep + file_id + '.html', 'w')
    output_html.write('<html>\n')
    output_html.write('\t<head>\n')
    output_html.write('\t\t<link rel="stylesheet" type="text/css" href="{0}">\n'.format(css_filename))
    output_html.write('\t</head>\n')
    output_html.write('\t<body>\n')
    output_html.write('\t\t<div class="left">\n')
    output_html.write('\t\t\t<div class="title">\n')
    output_html.write('\t\t\t\t{0}<br>\n'.format(text.GetTitle()))
    output_html.write('\t\t\t</div>\n')
    output_html.write('\t\t\t{0}<br>\n'.format(text.GetPublication()))
    output_html.write('\t\t</div>\n')
    output_html.write('\t\t<div class="center">\n')
    # Create a JSON for each text (for mid-level twic, text tile)
    # Figure out the possible topics for each word based on the topic state file
    current_fwt = None
    for fwt in fwt_collection:
        fwt_file_id = Utils_MalletInterpret.GetFilename(fwt.GetFilename())
        if split_filename:
            fwt_file_id = Utils_MalletInterpret.GetFilenameWithUnderscore(fwt.GetFilename())
        if fwt_file_id == file_id:
            current_fwt = fwt
            break
    # Convert text to JSON readable by the high-level TWiC visualization
    TWiC_MalletInterpret.ConvertTextToJSON(text, output_dir + "json" + os.sep + "texts" + os.sep, mallet_script, current_fwt)
    # Read in the plain text file
    input_file = open(current_tp.filename, 'r')
    data = input_file.readlines()
    input_file.close()
    # If there was no state file entry, output HTML lines without topics
    used_topics_list = []
    if None == current_fwt:
        for line in data:
            output_line = ''
            words = line.split(' ')
            for actual_word_index in range(0, len(words)):
                output_line += words[actual_word_index] + ' '
            output_line = output_line.strip()
            output_html.write('\t\t\t' + output_line + '<br>\n')
    else:
        # Walk the state-file word list in lockstep with the words of each
        # line. The state index only advances on a match, so non-matching
        # words (presumably tokens the state file omits, e.g. stopwords --
        # TODO confirm) are emitted uncolored.
        statefile_word_index = 0
        for line in data:
            output_line = ''
            words = line.split(' ')
            if statefile_word_index < len(current_fwt.word_info):
                lowercase_state_word = clean_word(current_fwt.word_info[statefile_word_index].word.lower())
            # Go through each word in the line
            for actual_word_index in range(0, len(words)):
                # Lowercase only for comparison
                lowercase_word = clean_word(words[actual_word_index].lower())
                if statefile_word_index < len(current_fwt.word_info) and \
                   lowercase_word == lowercase_state_word:
                    # Topic-assigned word: colored, bold, tooltip carries the topic ID
                    output_line += '<span title="Topic {0}"><font color="{1}"><b>{2}</b></font></span>'.format(current_fwt.word_info[statefile_word_index].topic, color_list[int(current_fwt.word_info[statefile_word_index].topic)], words[actual_word_index])
                    if current_fwt.word_info[statefile_word_index].topic not in used_topics_list:
                        used_topics_list.append(current_fwt.word_info[statefile_word_index].topic)
                    statefile_word_index += 1
                    # Pre-fetch the next state-file word for the next comparison
                    if statefile_word_index < len(current_fwt.word_info):
                        lowercase_state_word = clean_word(current_fwt.word_info[statefile_word_index].word.lower())
                else:
                    output_line += words[actual_word_index]
                output_line += ' '
            output_line = output_line.strip()
            output_html.write('\t\t\t' + output_line + '<br>\n')
    output_html.write('\t\t</div><br><br>\n')
    output_html.write('\t\t<div class="topics">\n')
    # Legend: one entry per topic actually seen in this text
    for used_topic in used_topics_list:
        output_html.write('\t\t\t<font color="{0}">Topic {1}: {2}</font><br>\n'.format(color_list[int(used_topic)], used_topic, topic_keys.corpus_topic_words[used_topic]))
    output_html.write('\t\t</div>\n')
    output_html.write('\t</body>\n')
    output_html.write('</html>')
    output_html.close()
def Build_JSONForTextwithForeignObject(text, output_dir, css_filename, current_tp, fwt_collection, topic_keys, color_list, mallet_script, split_filename=False):
    """Write the per-text JSON file, embedding the text body as partial XHTML
    suitable for insertion into an SVG foreignObject client-side.

    Skips the text (with a warning) when ConvertTextToJSON reports that no
    state-file data exists for it.
    """
    # Identifier used to match this text against state-file entries
    file_id = text.GetFilename()
    if split_filename:
        file_id = text.GetFilename().split("_")[0]

    # Locate this text's word/topic assignments from the topic state file
    current_fwt = None
    for candidate in fwt_collection:
        candidate_id = Utils_MalletInterpret.GetFilename(candidate.GetFilename())
        if split_filename:
            candidate_id = Utils_MalletInterpret.GetFilenameWithUnderscore(candidate.GetFilename())
        if candidate_id == file_id:
            current_fwt = candidate
            break

    # Get the line-word-topic map for this text (False: do not write it yet)
    texts_json_dir = output_dir + "json" + os.sep + "texts" + os.sep
    json_data = TWiC_MalletInterpret.ConvertTextToJSON(text, texts_json_dir, mallet_script, current_fwt, False)
    if json_data == "No state file data":
        print("Warning: No state file data for {0}".format(text.GetFilename()))
        return

    # Partial XHTML to be inserted inside a foreignObject tag; start with a
    # spacing span between the panel's control bar and the body.
    html_parts = ['<xhtml:p class="text_p"><xhtml:span class="text_edgespan"> </xhtml:span></xhtml:p>']

    # One paragraph per text line; colored spans for topic-assigned words
    lines_and_colors = json_data["document"]["lines_and_colors"]
    for entry in lines_and_colors:
        line_words, word_colors = entry[0], entry[1]
        html_parts.append('<xhtml:p class="text_p">')
        html_parts.append('<xhtml:span class="text_edgespan"> </xhtml:span>')
        for word_index, word in enumerate(line_words):
            color_key = str(word_index)
            if color_key in word_colors:
                html_parts.append('<xhtml:span class="text_coloredword" style="color:{0}">{1} </xhtml:span>'.format(color_list[int(word_colors[color_key])], word))
            else:
                html_parts.append('<xhtml:span class="text_word">{0} </xhtml:span>'.format(word))
        html_parts.append('</xhtml:p>')

    # Attach the foreignObject HTML and the line count (used client-side for
    # panel height sizing) to the JSON payload
    json_data["document"]["full_text"] = ''.join([str(part) for part in html_parts])
    json_data["document"]["line_count"] = len(lines_and_colors)

    # Write the JSON file for this text
    with open(texts_json_dir + text.GetFilename() + ".json", 'w') as fileptr:
        fileptr.write(json.dumps(json_data))
def main(args):
    """Script entry point: run Corpus2Vis on args via the TimeAndRun wrapper."""
    Utils_MalletInterpret.TimeAndRun(Corpus2Vis, args)
def InterpretMalletOutput(mallet_script):
    """Dickinson-corpus pipeline: read the MALLET output files (via
    mallet_script) and emit the JSON data driving the TWiC visualization
    under ../../../data/dickinson/input/.

    NOTE(review): near-duplicate of the generic TWiC interpretation pipeline
    (TWiC_Text variant); this one builds TWiC_Poem objects and matches texts
    to records on the full filename rather than a prefix.
    """
    print 'Interpreting MALLET output for visualization...'
    # myoutput_dir = '/Users/PeregrinePickle/Documents/Programming/Corpora/Dickinson/output/myviz-output/'
    myoutput_dir = os.path.join("..", "..", "..", "data", "dickinson", "input" + os.sep)
    print '\tReading in MALLET output...'
    ####### 1. Reading dickinson.topics.tsv
    # Per-text topic proportions (MALLET 2.0.9 file format)
    tp_collection = mallet_script.GetTopicsFileData("2.0.9")
    ###### 2. Reading dickinson.keys.tsv
    topic_keys = mallet_script.GetKeysFileData()
    ###### 3. Reading dickinson.topic-state.tsv
    # Per-file word -> topic assignments from the Gibbs state file
    fwt_collection = mallet_script.GetStateFileData()
    ###### 4. Reading dickinson.wordweights.tsv
    ww_table = mallet_script.GetTopicWordWeights()
    ###### 5. Build a text object for each text
    print '\tBuilding text objects...'
    # start_time = int(round(time.time() * 1000))
    # textobj_collection = TWiC_MalletInterpret.Build_TextObjects(TWiC_Poem, mallet_script, tp_collection)
    # end_time = int(round(time.time() * 1000))
    # print 'Unoptimized: {0}ms'.format(end_time - start_time)
    # start_time = int(round(time.time() * 1000))
    # text_obj_collection_opt = TWiC_MalletInterpret.Build_TextObjects_Opt(TWiC_Poem, mallet_script, tp_collection)
    textobj_collection = TWiC_MalletInterpret.Build_TextObjects_Opt(
        TWiC_Poem, mallet_script, tp_collection)
    # end_time = int(round(time.time() * 1000))
    # print 'Optimized: {0}ms'.format(end_time - start_time)
    ###### 6. Generate a list of unique colors for each topic
    print '\tCreating color list...'
    color_list = Utils_Color.Get_UniqueColorList(
        len(topic_keys.corpus_topic_proportions.keys()))
    ###### 7. Build HTML and JSON files for each text for low and mid level TWiC representations
    print '\tCreating JSON files for TWiC views of texts...'
    for text in textobj_collection:
        # Match each text object to its topic-proportion record on filename
        current_tp = None
        for tp in tp_collection:
            if text.GetFilename() == Utils_MalletInterpret.GetFilename(
                    tp.filename):
                current_tp = tp
                break
        TWiC_MalletInterpret.Build_JSONForTextwithForeignObject(text, myoutput_dir, '{0}.css'.format(mallet_script.corpus_name), \
            current_tp, fwt_collection, topic_keys, color_list, mallet_script)
    ###### 8. Build JSON files for visualization
    print '\tBuilding JSON map files for TWiC visualization...'
    # Build a json that shows the hierarchy of Corpus -> Text clusters -> Texts based on Jensen-Shannon Distance
    TWiC_MalletInterpret.Build_CorpusMapJSON_Avg(
        mallet_script.corpus_title, topic_keys.corpus_topic_proportions,
        tp_collection, myoutput_dir + "json" + os.sep)
    # Output a JSON of the topic-color list
    # TWiC_MalletInterpret.Build_TopicColorMapJSON(color_list, myoutput_dir + "json/")
    # Generate topic list JSON based on the used_topics_list
    # TWiC_MalletInterpret.Build_TopicWordsJSON(topic_keys, myoutput_dir + "json/")
    # New JSON format for client side
    TWiC_MalletInterpret.Build_CorpusInfoJSON(
        mallet_script.corpus_title, textobj_collection, tp_collection,
        topic_keys, color_list, myoutput_dir + "json" + os.sep)
    # Build a json that lists the distribution weights of words likely to appear in each topic
    TWiC_MalletInterpret.Build_WordWeightJSON(
        ww_table, myoutput_dir + "json" + os.sep)