def bigdata_mse(request, input_dict, output_dict, widget):
    """Render the mean squared error of discomll big-data predictions.

    The score is cached in a text file under MEDIA_ROOT/discomll_measures,
    keyed by the prediction tag, so repeated views skip recomputation.
    Requires the dataset's id_index to be set.
    """
    from discomll.utils import accuracy
    from disco.core import result_iterator
    import os.path
    from mothra.settings import MEDIA_ROOT
    from workflows.helpers import ensure_dir
    folder = 'discomll_measures'
    tag = input_dict["predictions"]
    # Cache path keyed on the tag with its scheme prefix (first 6 chars) dropped.
    destination = MEDIA_ROOT + '/' + folder + "/" + tag[0][6:] + '.txt'
    ensure_dir(destination)
    header = "Mean squared error\n"
    if input_dict["dataset"].params["id_index"] == -1:
        input_dict["string"] = "ID index should be defined."
    elif not os.path.isfile(destination):  # file doesnt exists
        results = accuracy.measure(test_data=input_dict["dataset"],
                                   predictions=input_dict["predictions"],
                                   measure="mse")
        # Original wrote only the *last* value to the cache (and raised
        # NameError when the iterator was empty); persist every score line
        # so the cached view matches the freshly computed one.
        body = "".join(str(v) + "\n" for k, v in result_iterator(results))
        input_dict["string"] = header + body
        with open(destination, 'w') as f:  # 'with' closes the handle on error too
            f.write(body)
    else:
        # Cached: show the whole stored body (old one-value cache files still work).
        with open(destination, 'r') as f:
            input_dict["string"] = header + f.read()
    return render(request, 'visualizations/display_string.html',
                  {'widget': widget, 'input_dict': input_dict, 'output_dict': output_dict})
def upload(self, request, pk=None):
    """Save an uploaded file into the workflow's folder and attach it.

    Streams the upload to disk in chunks, points the input at the stored
    path, marks the widget for re-execution, and returns a JSON status.
    """
    inp = self.get_object()  # renamed: 'input' shadowed the builtin
    try:
        destination = settings.FILES_FOLDER + str(
            inp.widget.workflow.id) + '/' + request.FILES['file'].name
        ensure_dir(destination)
        # Context manager guarantees the handle closes even if a chunk write fails.
        with open(destination, 'wb') as destination_file:
            for chunk in request.FILES['file'].chunks():
                destination_file.write(chunk)
        inp.value = destination
        inp.save()
        inp.widget.unfinish()  # widget output is now stale; force re-run
        data = json.dumps({
            'status': 'ok',
            'message': 'File successfully uploaded'
        })
    except Exception as e:
        data = json.dumps({
            'status': 'error',
            'message': 'Problem uploading file: {}'.format(str(e))
        })
    return HttpResponse(data, 'application/json')
def model_view(request, input_dict, output_dict, widget):
    """Dump a fitted discomll model to a cached text file and render a link.

    The dump is written once per model tag under MEDIA_ROOT/discomll_models;
    subsequent views reuse the existing file.
    """
    from discomll.utils import model_view
    import os.path
    from mothra.settings import MEDIA_ROOT
    from workflows.helpers import ensure_dir
    folder = 'discomll_models'
    tag_name = input_dict["fitmodel_url"]
    tag = input_dict["fitmodel_url"].values()[0]
    # Cache path keyed on the tag with its scheme prefix (first 6 chars) dropped.
    destination = MEDIA_ROOT + '/' + folder + "/" + tag[0][6:] + '.txt'
    ensure_dir(destination)
    if not os.path.isfile(destination):  # file doesnt exists
        model = model_view.output_model(tag_name)
        with open(destination, 'w') as f:  # close guaranteed even if write raises
            f.write(model)
    filename = folder + "/" + tag[0][6:] + '.txt'
    output_dict['filename'] = filename
    return render(request, 'visualizations/string_to_file.html', {
        'widget': widget,
        'input_dict': input_dict,
        'output_dict': output_dict
    })
def results_to_file(request, input_dict, output_dict, widget):
    """Materialize disco results into a cached text file and render a link.

    Each result line is "<key> <value>"; with add_params the full value is
    written, otherwise only its first element.
    """
    from disco.core import result_iterator
    import os.path
    from mothra.settings import MEDIA_ROOT
    from workflows.helpers import ensure_dir
    tag = input_dict["string"]
    folder = 'discomll_results'
    add = "add" if input_dict["add_params"] == "true" else ""
    destination = MEDIA_ROOT + '/' + folder + "/" + tag[0][6:] + add + '.txt'
    ensure_dir(destination)
    if not os.path.isfile(destination):  # file doesnt exists
        include_params = input_dict["add_params"] == "true"
        # Single loop (branches differed only in v vs v[0]); 'with' prevents
        # a leaked handle if iteration or a write raises.
        with open(destination, 'w') as f:
            for k, v in result_iterator(tag):
                value = v if include_params else v[0]
                f.write(str(k) + " " + str(value) + "\n")
    filename = folder + "/" + tag[0][6:] + add + '.txt'
    output_dict['filename'] = filename
    return render(request, 'visualizations/string_to_file.html',
                  {'widget': widget, 'input_dict': input_dict, 'output_dict': output_dict})
def results_to_file(request, input_dict, output_dict, widget):
    """Write disco results to a cached text file and render a download link.

    Duplicate of the sibling results_to_file; kept for compatibility.
    Lines are "<key> <value>"; add_params selects the full value vs its head.
    """
    from disco.core import result_iterator
    import os.path
    from mothra.settings import MEDIA_ROOT
    from workflows.helpers import ensure_dir
    tag = input_dict["string"]
    folder = 'discomll_results'
    add = "add" if input_dict["add_params"] == "true" else ""
    destination = MEDIA_ROOT + '/' + folder + "/" + tag[0][6:] + add + '.txt'
    ensure_dir(destination)
    if not os.path.isfile(destination):  # file doesnt exists
        include_params = input_dict["add_params"] == "true"
        # Merged the two near-identical loops; context manager closes on error.
        with open(destination, 'w') as f:
            for k, v in result_iterator(tag):
                value = v if include_params else v[0]
                f.write(str(k) + " " + str(value) + "\n")
    filename = folder + "/" + tag[0][6:] + add + '.txt'
    output_dict['filename'] = filename
    return render(request, 'visualizations/string_to_file.html', {
        'widget': widget,
        'input_dict': input_dict,
        'output_dict': output_dict
    })
def model_view(request, input_dict, output_dict, widget):
    """Dump a fitted discomll model to a cached file and render a link.

    Duplicate of the sibling model_view; the dump is written once per model
    tag under MEDIA_ROOT/discomll_models and reused afterwards.
    """
    from discomll.utils import model_view
    import os.path
    from mothra.settings import MEDIA_ROOT
    from workflows.helpers import ensure_dir
    folder = 'discomll_models'
    tag_name = input_dict["fitmodel_url"]
    tag = input_dict["fitmodel_url"].values()[0]
    destination = MEDIA_ROOT + '/' + folder + "/" + tag[0][6:] + '.txt'
    ensure_dir(destination)
    if not os.path.isfile(destination):  # file doesnt exists
        model = model_view.output_model(tag_name)
        with open(destination, 'w') as f:  # close guaranteed even if write raises
            f.write(model)
    filename = folder + "/" + tag[0][6:] + '.txt'
    output_dict['filename'] = filename
    return render(request, 'visualizations/string_to_file.html',
                  {'widget': widget, 'input_dict': input_dict, 'output_dict': output_dict})
def adc_to_csv(request, input_dict, output_dict, widget):
    """Export annotation texts from an annotated document corpus to CSV.

    Builds a column per annotation name listed in input_dict['ann'] (one per
    line) and writes a tab-separated CSV under the user's media folder.
    """
    from mothra.settings import MEDIA_ROOT
    from workflows.helpers import ensure_dir
    destination = MEDIA_ROOT + '/' + str(request.user.id) + '/' + str(
        widget.id) + '.csv'
    ensure_dir(destination)
    # Removed a stray `open(destination, 'w')` whose handle was never used or
    # closed; pandas' to_csv opens the file itself.
    adc = input_dict['adc']
    ann = input_dict['ann']
    df = defaultdict(list)
    for doc in adc.documents:
        for annotation in ann.split('\n'):
            annotation = annotation.strip()
            df[annotation].extend(doc.get_annotation_texts(annotation))
    df = pd.DataFrame(df, columns=df.keys())
    df.to_csv(destination, sep='\t', encoding='utf-8')
    filename = str(request.user.id) + '/' + str(widget.id) + '.csv'
    output_dict['filename'] = filename
    return render(request, 'visualizations/adc_to_csv.html', {
        'widget': widget,
        'input_dict': input_dict,
        'output_dict': output_dict
    })
def bigdata_ca(request, input_dict, output_dict, widget):
    """Render classification accuracy of big-data predictions, cached on disk.

    Requires the dataset's id_index; the "<measure> <accuracy>" score line is
    stored under MEDIA_ROOT/discomll_measures keyed by the prediction tag.
    """
    from discomll.utils import accuracy
    import os.path
    from mothra.settings import MEDIA_ROOT
    from workflows.helpers import ensure_dir
    folder = 'discomll_measures'
    tag = input_dict["predictions"]
    destination = MEDIA_ROOT + '/' + folder + "/" + tag[0][6:] + '.txt'
    ensure_dir(destination)
    string = "Classification Accuracy \n"
    if input_dict["dataset"].params["id_index"] == -1:
        input_dict["string"] = "ID index should be defined."
    elif not os.path.isfile(destination):  # file doesnt exists
        measure, acc = accuracy.measure(test_data=input_dict["dataset"],
                                        predictions=input_dict["predictions"],
                                        measure="ca")
        score = str(measure) + " " + str(acc) + "\n"
        input_dict["string"] = string + score
        with open(destination, 'w') as f:  # 'with' closes the handle on error too
            f.write(score)
    else:  # ca results are cached
        with open(destination, 'r') as f:
            input_dict["string"] = string + str(f.readlines()[0])
    return render(request, 'visualizations/display_string.html',
                  {'widget': widget, 'input_dict': input_dict, 'output_dict': output_dict})
def scikitAlgorithms_displayDecisionTree(request, input_dict, output_dict, widget):
    """Visualization displaying a decision tree.

    Exports the fitted scikit-learn tree as Graphviz .dot under the user's
    media folder, rasterizes it to PNG with the `dot` binary, and renders
    the image template.
    """
    import subprocess
    from sklearn import tree
    from mothra.settings import MEDIA_ROOT
    from workflows.helpers import ensure_dir
    # Graphviz source file
    filename = '/'.join(
        [str(request.user.id), 'decisionTree-scikit-%d.dot' % widget.id])
    destination_dot = '/'.join([MEDIA_ROOT, filename])
    ensure_dir(destination_dot)
    tree.export_graphviz(input_dict['classifier'], out_file=destination_dot)
    # rendered PNG
    filename = '/'.join(
        [str(request.user.id), 'decisionTree-scikit-%d.png' % widget.id])
    destination_img = '/'.join([MEDIA_ROOT, filename])
    ensure_dir(destination_img)
    # Argument list instead of a shell-interpolated string: no quoting or
    # injection pitfalls if a path ever contains spaces/metacharacters.
    subprocess.call(['dot', '-Tpng', destination_dot, '-o', destination_img])
    return render(
        request,
        'visualizations/scikitAlgorithms_display_decision_tree.html', {
            'filename': filename,
            'widget': widget,
            'input_dict': input_dict
        })
def odt_to_tab(request, input_dict, output_dict, widget):
    """Save the Orange data table to <user>/<widget>.tab and render a file link."""
    import Orange
    from mothra.settings import MEDIA_ROOT
    from workflows.helpers import ensure_dir
    relative = str(request.user.id) + '/' + str(widget.id) + '.tab'
    destination = MEDIA_ROOT + '/' + relative
    ensure_dir(destination)
    # Orange's native .tab serialization handles the actual writing.
    input_dict['data'].save(destination)
    output_dict['filename'] = relative
    return render(request, 'visualizations/string_to_file.html',
                  {'widget': widget, 'input_dict': input_dict, 'output_dict': output_dict})
def MUSE_string_to_file_V3(request, input_dict, output_dict, widget):
    """Write the widget's data payload to a user-scoped file (V3 template)."""
    from mothra.settings import MEDIA_ROOT
    from workflows.helpers import ensure_dir
    # File name is <widget id> + caller-supplied extension, under the user id.
    leaf = str(widget.id) + str(input_dict['fending'])
    basename = '/'.join([str(request.user.id), leaf])
    destination = '/'.join([MEDIA_ROOT, basename])
    ensure_dir(destination)
    with open(destination, 'w') as out:
        out.write(str(input_dict['data']))
    return render(request, 'visualizations/MUSE_string_to_file_v3.html',
                  {'widget': widget, 'fileURL': basename})
def string_to_file(request, input_dict, output_dict, widget):
    """Dump input_dict['string'] to <user>/<widget>.txt and render a link."""
    from mothra.settings import MEDIA_ROOT
    from workflows.helpers import ensure_dir
    destination = MEDIA_ROOT + '/' + str(request.user.id) + '/' + str(widget.id) + '.txt'
    ensure_dir(destination)
    # Context manager closes the file even if the write raises.
    with open(destination, 'w') as f:
        f.write(str(input_dict['string']))
    filename = str(request.user.id) + '/' + str(widget.id) + '.txt'
    output_dict['filename'] = filename
    return render(request, 'visualizations/string_to_file.html',
                  {'widget': widget, 'input_dict': input_dict, 'output_dict': output_dict})
def bio3graph_biomine_visualizer(request, input_dict, output_dict, widget):
    """Store the Biomine graph as <user>/<widget>.bmg and render the visualizer."""
    from mothra.settings import MEDIA_ROOT
    from workflows.helpers import ensure_dir
    filename = os.path.join(str(request.user.id), str(widget.id) + '.bmg')
    destination = os.path.join(MEDIA_ROOT, filename)
    ensure_dir(destination)
    # Context manager closes the file even if the write raises.
    with open(destination, 'w') as f:
        f.write(str(input_dict['biomine_graph']))
    return render(request, 'visualizations/bio3graph_biomine_visualizer.html',
                  {'widget': widget, 'filename': filename})
def MUSE_view_xml(request, input_dict, output_dict, widget):
    """Write the XML payload to <user>/<widget>.xml and render the XML viewer."""
    from mothra.settings import MEDIA_ROOT
    from workflows.helpers import ensure_dir
    filename = os.path.join(str(request.user.id), str(widget.id) + '.xml')
    destination = os.path.join(MEDIA_ROOT, filename)
    ensure_dir(destination)
    # Context manager closes the file even if the write raises.
    with open(destination, 'w') as f:
        f.write(str(input_dict['xml_data']))
    return render(request, 'visualizations/MUSE_view_xml.html',
                  {'widget': widget, 'filename': filename})
def segmine_biomine_visualizer(request, input_dict, output_dict, widget):
    """Store the segmine graph as <user>/<widget>.bmg and render the visualizer."""
    from mothra.settings import MEDIA_ROOT
    from workflows.helpers import ensure_dir
    filename = os.path.join(str(request.user.id), str(widget.id) + '.bmg')
    destination = os.path.join(MEDIA_ROOT, filename)
    ensure_dir(destination)
    # Context manager closes the file even if the write raises.
    with open(destination, 'w') as f:
        f.write(str(input_dict['graph']))
    return render(request, 'visualizations/segmine_biomine_visualizer.html', {
        'widget': widget,
        'filename': filename
    })
def odt_to_arff(request, input_dict, output_dict, widget):
    """Export the Orange data table as ARFF and render a download link."""
    import Orange
    from mothra.settings import MEDIA_ROOT
    from workflows.helpers import ensure_dir
    relative = str(request.user.id) + "/" + str(widget.id) + ".arff"
    destination = MEDIA_ROOT + "/" + relative
    ensure_dir(destination)
    # Orange infers the ARFF format from the extension.
    input_dict["data"].save(destination)
    output_dict["filename"] = relative
    return render(
        request,
        "visualizations/string_to_file.html",
        {"widget": widget, "input_dict": input_dict, "output_dict": output_dict},
    )
def MUSE_view_xml(request, input_dict, output_dict, widget):
    """Write the XML payload to <user>/<widget>.xml and render the XML viewer.

    Duplicate of the sibling MUSE_view_xml that joins paths with '/'.
    """
    from mothra.settings import MEDIA_ROOT
    from workflows.helpers import ensure_dir
    filename = '/'.join([str(request.user.id), str(widget.id) + '.xml'])
    destination = '/'.join([MEDIA_ROOT, filename])
    ensure_dir(destination)
    # Context manager closes the file even if the write raises.
    with open(destination, 'w') as f:
        f.write(str(input_dict['xml_data']))
    return render(request, 'visualizations/MUSE_view_xml.html', {
        'widget': widget,
        'filename': filename
    })
def odt_to_arff(request, input_dict, output_dict, widget):
    """Persist the Orange table as <user>/<widget>.arff and link to it."""
    import Orange
    from mothra.settings import MEDIA_ROOT
    from workflows.helpers import ensure_dir
    relative = str(request.user.id) + '/' + str(widget.id) + '.arff'
    destination = MEDIA_ROOT + '/' + relative
    ensure_dir(destination)
    input_dict['data'].save(destination)
    output_dict['filename'] = relative
    return render(request, 'visualizations/string_to_file.html',
                  {'widget': widget, 'input_dict': input_dict, 'output_dict': output_dict})
def MUSE_virtual_environment_visualization(request, input_dict, output_dict, widget):
    """Persist NLP data for the 3D environment viewer and render it with the Unity link."""
    from mothra.settings import MEDIA_ROOT
    from workflows.helpers import ensure_dir
    filename = os.path.join(str(request.user.id), str(widget.id) + '.txt')
    destination = os.path.join(MEDIA_ROOT, filename)
    ensure_dir(destination)
    # Context manager closes the file even if the write raises.
    with open(destination, 'w') as f:
        f.write(str(input_dict['NLP_data']))
    return render(request, 'visualizations/MUSE_view_3D_environment.html',
                  {'widget': widget,
                   'filename': filename,
                   'unitylink': input_dict['unitylink']
                   })
def MUSE_string_to_file(request, input_dict, output_dict, widget):
    """Write the widget's data payload to a user-scoped file and render the link."""
    from mothra.settings import MEDIA_ROOT
    from workflows.helpers import ensure_dir
    # File name is <widget id> + caller-supplied extension, under the user id.
    leaf = str(widget.id) + str(input_dict['fending'])
    basename = '/'.join([str(request.user.id), leaf])
    destination = '/'.join([MEDIA_ROOT, basename])
    ensure_dir(destination)
    with open(destination, 'w') as out:
        out.write(str(input_dict['data']))
    return render(request, 'visualizations/MUSE_string_to_file.html',
                  {'widget': widget, 'fileURL': basename})
def corpus_to_csv(request, input_dict, output_dict, widget):
    """Write the corpus dataframe to <user>/<widget>.csv (';'-separated) and link it."""
    from mothra.settings import MEDIA_ROOT
    from workflows.helpers import ensure_dir
    relative = str(request.user.id) + '/' + str(widget.id) + '.csv'
    destination = MEDIA_ROOT + '/' + relative
    ensure_dir(destination)
    # pandas handles the file I/O; semicolon separator, no index column.
    input_dict['df'].to_csv(destination, encoding='utf-8', sep=';', index=False)
    output_dict['filename'] = relative
    return render(request, 'visualizations/string_to_file.html',
                  {'widget': widget, 'input_dict': input_dict, 'output_dict': output_dict})
def MUSE_virtual_environment_visualization(request, input_dict, output_dict, widget):
    """Persist NLP data for the 3D environment viewer and render it with the Unity link.

    Duplicate of the sibling MUSE_virtual_environment_visualization.
    """
    from mothra.settings import MEDIA_ROOT
    from workflows.helpers import ensure_dir
    filename = os.path.join(str(request.user.id), str(widget.id) + '.txt')
    destination = os.path.join(MEDIA_ROOT, filename)
    ensure_dir(destination)
    # Context manager closes the file even if the write raises.
    with open(destination, 'w') as f:
        f.write(str(input_dict['NLP_data']))
    return render(
        request, 'visualizations/MUSE_view_3D_environment.html', {
            'widget': widget,
            'filename': filename,
            'unitylink': input_dict['unitylink']
        })
def string_to_file(request, input_dict, output_dict, widget):
    """Dump input_dict["string"] to <user>/<widget>.txt and render a link.

    Duplicate of the sibling string_to_file (double-quoted variant).
    """
    from mothra.settings import MEDIA_ROOT
    from workflows.helpers import ensure_dir
    destination = MEDIA_ROOT + "/" + str(request.user.id) + "/" + str(widget.id) + ".txt"
    ensure_dir(destination)
    # Context manager closes the file even if the write raises.
    with open(destination, "w") as f:
        f.write(str(input_dict["string"]))
    filename = str(request.user.id) + "/" + str(widget.id) + ".txt"
    output_dict["filename"] = filename
    return render(
        request,
        "visualizations/string_to_file.html",
        {"widget": widget, "input_dict": input_dict, "output_dict": output_dict},
    )
def string_to_file(request, input_dict, output_dict, widget):
    """Dump input_dict['string'] to <user>/<widget>.txt and render a link.

    Duplicate of the sibling string_to_file functions.
    """
    from mothra.settings import MEDIA_ROOT
    from workflows.helpers import ensure_dir
    destination = MEDIA_ROOT + '/' + str(request.user.id) + '/' + str(
        widget.id) + '.txt'
    ensure_dir(destination)
    # Context manager closes the file even if the write raises.
    with open(destination, 'w') as f:
        f.write(str(input_dict['string']))
    filename = str(request.user.id) + '/' + str(widget.id) + '.txt'
    output_dict['filename'] = filename
    return render(request, 'visualizations/string_to_file.html', {
        'widget': widget,
        'input_dict': input_dict,
        'output_dict': output_dict
    })
def upload(self, request, pk=None):
    """Save the uploaded file for this input's workflow and return a JSON status.

    Streams the upload to disk in chunks, attaches the stored path to the
    input, and marks the widget for re-execution.
    """
    inp = self.get_object()  # renamed: 'input' shadowed the builtin
    try:
        destination = FILES_FOLDER + str(inp.widget.workflow.id) + '/' + request.FILES['file'].name
        ensure_dir(destination)
        # Context manager closes the handle even if a chunk write fails.
        with open(destination, 'wb') as destination_file:
            for chunk in request.FILES['file'].chunks():
                destination_file.write(chunk)
        inp.value = destination
        inp.save()
        inp.widget.unfinish()  # widget output is now stale; force re-run
        data = json.dumps(
            {'status': 'ok', 'message': 'File successfully uploaded'})
    except Exception as e:  # 'as' spelling is valid on Python 2.6+ and 3
        data = json.dumps(
            {'status': 'error', 'message': 'Problem uploading file: {}'.format(str(e))})
    # Original fell off the end and returned None; mirror the sibling
    # upload() so the client actually receives the JSON payload.
    return HttpResponse(data, 'application/json')
def weka_local_display_decision_tree(request, input_dict, output_dict, widget):
    """Visualization displaying a decision tree.

    Deserializes the Weka classifier inside the JVM, dumps its Graphviz
    source, renders it to svg (or png for 'raster') with `dot`, and shows
    the resulting image.
    """
    import subprocess
    from mothra.settings import MEDIA_ROOT
    from workflows.helpers import ensure_dir
    # Weka objects live in the JVM; this thread must be attached first.
    if not jp.isThreadAttachedToJVM():
        jp.attachThreadToJVM()
    img_type = 'svg'
    if input_dict['img_type'] == 'raster':
        img_type = 'png'
    classifier = common.deserialize_weka_object(input_dict['classifier'])
    dot_text = classifier.graph()
    filename = '/'.join(
        [str(request.user.id), 'decisionTree-weka-%d.dot' % widget.id])
    destination_dot = '/'.join([MEDIA_ROOT, filename])
    ensure_dir(destination_dot)
    with open(destination_dot, 'w') as dot_file:
        dot_file.write(dot_text)
    # png/svg file
    filename = '/'.join([
        str(request.user.id),
        'decisionTree-weka-%d.%s' % (widget.id, img_type)
    ])
    destination_img = '/'.join([MEDIA_ROOT, filename])
    ensure_dir(destination_img)
    # Argument list instead of a shell-interpolated string: no quoting or
    # injection pitfalls if a path ever contains spaces/metacharacters.
    subprocess.call(['dot', '-T' + img_type, destination_dot, '-o', destination_img])
    return render(request,
                  'visualizations/weka_local_display_decision_tree.html', {
                      'filename': filename,
                      'widget': widget,
                      'input_dict': input_dict
                  })
def weka_local_display_decision_tree(request, input_dict, output_dict, widget):
    """Visualization displaying a decision tree"""
    import subprocess
    from mothra.settings import MEDIA_ROOT
    from workflows.helpers import ensure_dir
    # Weka objects live in the JVM; make sure this thread is attached.
    if not jp.isThreadAttachedToJVM():
        jp.attachThreadToJVM()
    img_type = 'png' if input_dict['img_type'] == 'raster' else 'svg'
    classifier = common.deserialize_weka_object(input_dict['classifier'])
    dot_text = classifier.graph()
    # Graphviz source file
    dot_name = '/'.join([str(request.user.id),
                         'decisionTree-weka-%d.dot' % widget.id])
    destination_dot = '/'.join([MEDIA_ROOT, dot_name])
    ensure_dir(destination_dot)
    with open(destination_dot, 'w') as dot_file:
        dot_file.write(dot_text)
    # png/svg file
    filename = '/'.join([str(request.user.id),
                         'decisionTree-weka-%d.%s' % (widget.id, img_type)])
    destination_img = '/'.join([MEDIA_ROOT, filename])
    ensure_dir(destination_img)
    subprocess.call("dot -T%s %s -o %s" %
                    (img_type, destination_dot, destination_img),
                    shell=True)
    return render(request,
                  'visualizations/weka_local_display_decision_tree.html',
                  {'filename': filename, 'widget': widget, 'input_dict': input_dict})
def bigdata_ca(request, input_dict, output_dict, widget):
    """Render classification accuracy of big-data predictions, cached on disk.

    Duplicate of the sibling bigdata_ca; requires the dataset's id_index.
    """
    from discomll.utils import accuracy
    import os.path
    from mothra.settings import MEDIA_ROOT
    from workflows.helpers import ensure_dir
    folder = 'discomll_measures'
    tag = input_dict["predictions"]
    destination = MEDIA_ROOT + '/' + folder + "/" + tag[0][6:] + '.txt'
    ensure_dir(destination)
    string = "Classification Accuracy \n"
    if input_dict["dataset"].params["id_index"] == -1:
        input_dict["string"] = "ID index should be defined."
    elif not os.path.isfile(destination):  # file doesnt exists
        measure, acc = accuracy.measure(test_data=input_dict["dataset"],
                                        predictions=input_dict["predictions"],
                                        measure="ca")
        score = str(measure) + " " + str(acc) + "\n"
        input_dict["string"] = string + score
        with open(destination, 'w') as f:  # 'with' closes the handle on error too
            f.write(score)
    else:  # ca results are cached
        with open(destination, 'r') as f:
            input_dict["string"] = string + str(f.readlines()[0])
    return render(request, 'visualizations/display_string.html', {
        'widget': widget,
        'input_dict': input_dict,
        'output_dict': output_dict
    })
def bigdata_mse(request, input_dict, output_dict, widget):
    """Render the mean squared error of discomll big-data predictions.

    Duplicate of the sibling bigdata_mse; the score is cached in a text
    file under MEDIA_ROOT/discomll_measures keyed by the prediction tag.
    """
    from discomll.utils import accuracy
    from disco.core import result_iterator
    import os.path
    from mothra.settings import MEDIA_ROOT
    from workflows.helpers import ensure_dir
    folder = 'discomll_measures'
    tag = input_dict["predictions"]
    destination = MEDIA_ROOT + '/' + folder + "/" + tag[0][6:] + '.txt'
    ensure_dir(destination)
    header = "Mean squared error\n"
    if input_dict["dataset"].params["id_index"] == -1:
        input_dict["string"] = "ID index should be defined."
    elif not os.path.isfile(destination):  # file doesnt exists
        results = accuracy.measure(test_data=input_dict["dataset"],
                                   predictions=input_dict["predictions"],
                                   measure="mse")
        # Original cached only the *last* value of the loop (NameError on an
        # empty iterator); persist the full body so cache and fresh view agree.
        body = "".join(str(v) + "\n" for k, v in result_iterator(results))
        input_dict["string"] = header + body
        with open(destination, 'w') as f:  # close guaranteed on error
            f.write(body)
    else:
        with open(destination, 'r') as f:
            input_dict["string"] = header + f.read()
    return render(request, 'visualizations/display_string.html', {
        'widget': widget,
        'input_dict': input_dict,
        'output_dict': output_dict
    })
def cfrm_display_rrfile(request, input_dict, output_dict, widget):
    """Write each redescription set to its own .rr file and render them all."""
    from mothra.settings import MEDIA_ROOT
    from workflows.helpers import ensure_dir
    output_content_all = input_dict['redescriptions']
    files = []
    # One numbered file per redescription set, under the user's media folder.
    for idx, content in enumerate(output_content_all, start=1):
        if content is None:
            content = 'Results missing :('
        filename = os.path.join(
            str(request.user.id),
            'redescriptions_{w_id}_{itCount}.rr'.format(w_id=widget.id,
                                                        itCount=idx))
        files.append(filename)
        destination_rr = os.path.join(MEDIA_ROOT, filename)
        ensure_dir(destination_rr)
        with open(destination_rr, 'w') as f:
            f.write(content)
    # 'random' busts browser caching of the generated files.
    return render(
        request, 'visualizations/cfrm_display_rrfile.html', {
            'files': files,
            'contents': output_content_all,
            'random': int(random.random() * 10000000),
            'widget': widget,
            'input_dict': input_dict
        })
def scikitAlgorithms_displayDecisionTree(request, input_dict, output_dict, widget):
    """Visualization displaying a decision tree"""
    import subprocess
    from sklearn import tree
    from mothra.settings import MEDIA_ROOT
    from workflows.helpers import ensure_dir
    user_dir = str(request.user.id)
    # Graphviz source file
    dot_name = '/'.join([user_dir, 'decisionTree-scikit-%d.dot' % widget.id])
    destination_dot = '/'.join([MEDIA_ROOT, dot_name])
    ensure_dir(destination_dot)
    tree.export_graphviz(input_dict['classifier'], out_file=destination_dot)
    # rendered PNG
    filename = '/'.join([user_dir, 'decisionTree-scikit-%d.png' % widget.id])
    destination_img = '/'.join([MEDIA_ROOT, filename])
    ensure_dir(destination_img)
    subprocess.call("dot -Tpng %s -o %s" % (destination_dot, destination_img),
                    shell=True)
    return render(request,
                  'visualizations/scikitAlgorithms_display_decision_tree.html',
                  {'filename': filename, 'widget': widget, 'input_dict': input_dict})
def clus_display_svg(request, input_dict, output_dict, widget):
    """Visualization displaying a decision tree"""
    # Converts one or more CLUS tree representations to Graphviz dot source,
    # renders it with the `dot` binary (svg by default, png for 'raster'),
    # and shows both the image and a link to the dot source.
    import subprocess
    from mothra.settings import MEDIA_ROOT
    from workflows.helpers import ensure_dir
    img_type = 'svg'
    if input_dict['img_type'] == 'raster':
        img_type = 'png'
    # NOTE(review): this sample J48 graph is dead code — dot_text is
    # unconditionally reassigned in both branches below.
    dot_text = """digraph J48Tree { N0 [label="f8" ] N0->N1 [label="= +"] N1 [label="f99" ] N1->N2 [label="= +"] N2 [label="east (10.0/1.0)" shape=box style=filled ] N1->N3 [label="= -"] N3 [label="west (3.0/1.0)" shape=box style=filled ] N0->N4 [label="= -"] N4 [label="west (7.0)" shape=box style=filled ] }"""
    if type(input_dict['classifier']) == list:
        # Multiple trees: concatenate their dot bodies into one digraph;
        # starting_id keeps node ids unique across trees.
        dot_text = ""
        starting_id = 0
        for cls in input_dict['classifier']:
            dot_representation, starting_id = clus_tree_to_dot(
                cls['representation'], starting_id)
            dot_text += dot_representation + "\n"
            # dot_text = dot_text + "digraph " + cls['name'] + " {\n" + \
            #     dot_representation + "}\n\n"
        dot_text = "digraph Tree {\n" + dot_text + "}"
    else:
        # Single tree.
        dot_text = "digraph Tree {\n" + clus_tree_to_dot(
            input_dict['classifier'], 0)[0] + "}"
    # Write the .dot source under the user's media folder.
    filename = '/'.join(
        [str(request.user.id), 'decisionTree-clus-%d.dot' % widget.id])
    dotfile = filename  # keep the dot path; `filename` is reused for the image
    destination_dot = '/'.join([MEDIA_ROOT, filename])
    ensure_dir(destination_dot)
    with open(destination_dot, 'w') as dot_file:
        dot_file.write(dot_text)
    # png/svg file
    filename = '/'.join([
        str(request.user.id),
        'decisionTree-clus-%d.%s' % (widget.id, img_type)
    ])
    destination_img = os.path.join(MEDIA_ROOT, filename)
    ensure_dir(destination_img)
    # Fall back to a `dot` binary on PATH when settings has no DOT_PATH.
    try:
        dot_path = settings.DOT_PATH
    except:
        dot_path = 'dot'
    # % binds tighter than +, so this is dot_path + " -T<type> <in> -o <out>".
    subprocess.call(dot_path + " -T%s %s -o %s" %
                    (img_type, destination_dot, destination_img),
                    shell=True)
    # 'random' busts browser caching of the regenerated image.
    return render(
        request, 'visualizations/cf_clus_display_svg_tree.html', {
            'filename': filename,
            'dotfile': dotfile,
            'random': int(random() * 10000000),
            'widget': widget,
            'input_dict': input_dict
        })
def display_corpus_statistic(request, input_dict, output_dict, widget, narrow_doc='n'):
    # Computes corpus statistics (n-gram frequencies, hapax/dis legomena, or
    # PMI bigram/trigram collocations, selected by input_dict['stat_type']),
    # writes them to a per-user CSV, and renders the statistics template.
    from mothra.settings import MEDIA_ROOT
    from workflows.helpers import ensure_dir
    corpus = input_dict['corpus']
    stat_type = input_dict['stat_type']
    allAnnotations = 0  # total n-gram count (denominator for relative frequency)
    result_list = []
    n = int(input_dict['n_gram'])  # n-gram size
    #get some general stats
    general_stats = {}
    general_stats['num_doc'] = len(corpus)
    doc_lengths = []
    all_tokens = set()
    for doc in corpus:
        # Documents may be strings or other objects; coerce via str() if
        # split() is unavailable.
        try:
            doc = doc.split()
        except:
            doc = str(doc).split()
        doc_lengths.append(len(doc))
        for tok in doc:
            all_tokens.add(tok)
    general_stats['num_tokens'] = sum(doc_lengths)
    if general_stats['num_doc'] > 0:
        general_stats['avg_doc_length'] = float(
            general_stats['num_tokens']) / general_stats['num_doc']
    else:
        general_stats['avg_doc_length'] = 0
    if general_stats['num_tokens'] > 0:
        # type-token ratio
        general_stats['ttr'] = len(all_tokens) / float(
            general_stats['num_tokens'])
    else:
        general_stats['ttr'] = 0
    if stat_type == 'frequency' or stat_type == 'dis_legomena' or stat_type == 'hapax_legomena':
        # Count every n-gram in every document.
        annotation_dict = {}  # n-gram -> raw count
        for doc in corpus:
            # Probe for split(); note the result is discarded — this only
            # tests whether doc is string-like, coercing otherwise.
            try:
                doc.split()
            except:
                doc = str(doc)
            # Documents with many '###' markers are annotation-separated;
            # otherwise tokenize on whitespace.
            if doc.count('###') > 3:
                annotations = doc.split('###')
            else:
                annotations = doc.split()
            length = len(annotations)
            for i in range(0, length - n + 1):
                # Build the space-joined n-gram starting at position i.
                combo = ""
                for j in range(i, i + n):
                    value = annotations[j]
                    if j > i:
                        combo += " "
                    combo += value
                if len(combo) > 0:
                    allAnnotations += 1
                    if combo in annotation_dict:
                        annotation_dict[combo] = annotation_dict[combo] + 1
                    else:
                        annotation_dict[combo] = 1
        title = "N-gram"
        measure = 'Frequency'
        if stat_type == 'frequency':
            allAnnotations = float(allAnnotations)
            # (n-gram, raw count, relative frequency) — top 100 by count.
            for pos, number in annotation_dict.items():
                try:
                    pos = pos.encode('utf8')
                    result_list.append(
                        (pos, number,
                         "{0:.4f}".format(float(number) / allAnnotations)))
                except:
                    continue
            result_list = sorted(result_list, key=lambda x: x[1], reverse=True)
            if len(result_list) > 100:
                result_list = result_list[:100]
        else:
            # dis_legomena: n-grams occurring exactly twice;
            # hapax_legomena: exactly once. Capped at 300 rows.
            allAnnotations = float(allAnnotations)
            for pos, number in annotation_dict.items():
                if stat_type == 'dis_legomena':
                    if number == 2:
                        pos = pos.encode('utf8')
                        result_list.append(
                            (pos, number,
                             "{0:.4f}".format(float(number) / allAnnotations)))
                else:
                    if number == 1:
                        pos = pos.encode('utf8')
                        result_list.append(
                            (pos, number,
                             "{0:.4f}".format(float(number) / allAnnotations)))
            if len(result_list) > 300:
                result_list = result_list[:300]
    else:
        # Collocation statistics: flatten the corpus into one token stream.
        all_annotations = []
        for doc in corpus:
            if doc.count('###') > 3:
                annotations = doc.split('###')
            else:
                annotations = doc.split()
            all_annotations.extend(annotations)
        if stat_type == 'pmi_bigrams':
            # Top-100 bigram collocations by pointwise mutual information.
            bigram_measures = nltk.collocations.BigramAssocMeasures()
            finder = BigramCollocationFinder.from_words(all_annotations)
            best = sorted(finder.score_ngrams(bigram_measures.pmi),
                          key=lambda x: x[1],
                          reverse=True)
            if len(best) > 100:
                best = best[:100]
            for tags, score in best:
                tag1, tag2 = tags
                result_list.append(
                    (tag1 + "\t" + tag2, "{0:.4f}".format(score)))
            title = "Bigram collocations"
        elif stat_type == 'pmi_trigrams':
            # Top-100 trigram collocations by PMI.
            trigram_measures = nltk.collocations.TrigramAssocMeasures()
            finder = TrigramCollocationFinder.from_words(all_annotations)
            best = sorted(finder.score_ngrams(trigram_measures.pmi),
                          key=lambda x: x[1],
                          reverse=True)
            if len(best) > 100:
                best = best[:100]
            for tags, score in best:
                tag1, tag2, tag3 = tags
                result_list.append(
                    (tag1 + " " + tag2 + " " + tag3,
                     "{0:.4f}".format(score)))
            title = "Trigram collocations"
        # NOTE(review): if stat_type matches neither pmi branch, `title` is
        # never bound and the comparison below raises NameError — presumably
        # stat_type is always one of the five known values; verify at caller.
        measure = 'PMI score'
    if title == 'N-gram':
        columns = ['N-gram', 'Raw frequency', 'Frequency']
        df = pd.DataFrame(result_list, columns=columns)
    if title != 'N-gram':
        columns = [title, measure]
        df = pd.DataFrame(result_list, columns=columns)
    # Persist the table as a per-user CSV and expose it for download.
    destination = MEDIA_ROOT + '/' + str(request.user.id) + '/' + str(
        widget.id) + '.csv'
    ensure_dir(destination)
    df.to_csv(destination, encoding='utf-8', sep=';', index=False)
    filename = str(request.user.id) + '/' + str(widget.id) + '.csv'
    output_dict['filename'] = filename
    return render(
        request, 'visualizations/corpus_statistics.html', {
            'widget': widget,
            'data': [result_list, title, measure, general_stats],
            'narrow_doc': narrow_doc,
            'output_dict': output_dict
        })