コード例 #1
0
    def __init__(self):
        #dict mapping competition to .bin
        self.com_to_bin = utils.mapping_competition_to_bin()
        
        #dict mapping competition to xml file
        self.com_to_xml = utils.mapping_competition_to_xml()
        
        #set settings to class attributes
        self.settings_to_class_attributes()
        
        #set ivar data
        self.data = defaultdict(lambda: defaultdict(list))

        
        #loop
        self.loop()
        
        #plot
        plot(self.data,
             self.data.keys(),
             'Average precision per %s' % self.feature,
             '%s' % self.feature,
             'precision',
             os.environ['output_path_pdf'],
             12)

        #plot barplot
        list_of_lists = []
        headers = ['POS', 'Competition (System)', 'Recall']

        for feature_value in ['a', 'n', 'r', 'v']:
            for competition in ['sval2', 'sval3', 'sval2007', 'sval2010', 'sval2013']:
                for (the_competition, system_name), answers in self.data[feature_value].iteritems():

                    if the_competition == competition:

                        recall = float(sum(answers)) / len(answers)
                        official_name = filename2official_name[system_name]

                        one_row = [feature_value, old2new[competition] + ' (%s)' % official_name, recall]
                        list_of_lists.append(one_row)

        plt.figure(figsize=(15, 8))
        df = pandas.DataFrame(list_of_lists, columns=headers)
        ax = seaborn.barplot(x='POS', y='Recall', hue='Competition (System)', data=df)
        ax.legend(loc=2, title='Competition (Top System overall $F_{1}$)', fontsize=11)
        ax.set_title('Recall per part of speech for each top ranked system', fontsize=16)

        matplotlib.rcParams.update({'font.size': 16})

        plt.xlabel('POS', fontsize=16)
        plt.ylabel('Recall', fontsize=16)
        plt.xticks(fontsize=16)
        plt.yticks(fontsize=16)

        output_path = os.environ['barplot_path_pdf']
        plt.savefig(output_path, bbox_inches='tight')

        print 'barplot saved to', output_path
コード例 #2
0
    def __init__(self):
        """Collect lemma data per context level, plot it and write reports.

        Reads the experiment settings from the environment variables
        ``competitions`` and ``allowed_pos`` (both underscore-separated) and
        ``output_folder``.  After ``self.loop_bins()`` has run, the data is
        passed to ``utils.plot_it`` and, for every context level, two
        tab-separated files are written into ``output_folder``:

        * ``<context_level>.csv`` -- one line per lemma that passes the
          filter below (lemma, pos, num_instances, followed by the instances);
        * ``<context_level>.csvstats.csv`` -- lemma and accumulated count,
          sorted by count in descending order.
        """
        #set class attribute data: one bucket per context level, each
        #starting out with an empty 'lemmas' dict
        self.context_levels = ['all', 'document', 'sentence', 'word']
        self.data = {
            context_level: {'lemmas': {}}
            for context_level in self.context_levels
        }

        #get dict competition to .bin and set class attributes of exp settings
        self.competitions = os.environ['competitions'].split("_")
        self.allowed_pos = os.environ['allowed_pos'].split('_')
        #keep only the competitions that were requested
        self.com_to_bin = {
            competition: path
            for competition, path in
            utils.mapping_competition_to_bin().iteritems()
            if competition in self.competitions
        }

        #update ivar data with .bin
        self.loop_bins()

        #plot + write information to file
        utils.plot_it(self.data, self.context_levels, self.competitions,
                      self.allowed_pos)

        #write stats to file
        for context_level in self.context_levels:
            output_file = os.path.join(os.environ['output_folder'],
                                       context_level + ".csv")
            stats = defaultdict(int)

            #both handles are managed by 'with' so they are closed even when
            #writing fails half-way (the stats file used to be closed by hand)
            with open(output_file + "stats.csv", "w") as stat_file:
                with open(output_file, "w") as outfile:
                    headers = "\t".join(
                        ['lemma', 'pos', 'num_instances', 'instances'])
                    outfile.write(headers + "\n")

                    #the outer keys of 'lemmas' are not needed here
                    for info in self.data[context_level]['lemmas'].itervalues():
                        for lemma, d in info.iteritems():
                            #one_sense_used returns a (flag, count) pair; its
                            #exact semantics live in utils -- TODO confirm
                            not_one_sense, how_much = utils.one_sense_used(
                                d['instances'])

                            #only report lemmas with at least two instances
                            #for which the flag is truthy
                            if d['num_instances'] >= 2 and not_one_sense:
                                stats[lemma] += how_much
                                output_line = [
                                    lemma, d['pos'], d['num_instances']
                                ] + d['instances']
                                outfile.write(
                                    "\t".join(map(str, output_line)) + "\n")

                stat_file.write("lemma\toccurences\n")
                for lemma, instances in sorted(stats.iteritems(),
                                               key=operator.itemgetter(1),
                                               reverse=True):
                    stat_file.write("%s\t%s\n" % (lemma, instances))
コード例 #3
0
    def __init__(self):
        """Set up the lookup tables and containers, then run and plot.

        ``self.loop()`` and ``self.plot_it()`` are called at the end, in
        that order; everything assigned before is available to them.
        """
        # lookup tables: competition -> .bin file and competition -> xml file
        self.com_to_bin = utils.mapping_competition_to_bin()
        self.com_to_xml = utils.mapping_competition_to_xml()

        # empty containers (presumably filled by loop() -- confirm there)
        self.mfs, self.notmfs = {}, {}

        # competition identifiers, in the order they are listed here
        self.labels = [
            'sval2',
            'sval3',
            'sval2007',
            'sval2010',
            'sval2013',
        ]

        # process the data, then produce the plot
        self.loop()
        self.plot_it()
コード例 #4
0
    def __init__(self):
        """Prepare the lookup tables and settings, then compute the stats.

        ``self.loop()`` must return a pair; its second element is printed
        and both elements are forwarded to ``self.compute_stats``.
        """
        # lookup tables: competition -> .bin file and competition -> xml file
        self.com_to_bin = utils.mapping_competition_to_bin()
        self.com_to_xml = utils.mapping_competition_to_xml()

        # expose the experiment settings as instance attributes
        self.settings_to_class_attributes()

        # run the main loop and split its two-part result
        loop_result = self.loop()
        stats, comp_stats = loop_result

        print(comp_stats)

        self.compute_stats(stats, comp_stats)
コード例 #5
0
 def __init__(self):
     """Run the analysis and write the error rate per competition.

     After ``self.loop()`` has run, one line ``<competition>\\t<error rate>``
     is written to the file named by ``os.environ['output_path_txt']`` for
     every competition in ``self.competitions`` that has an entry in
     ``self.results``.  The error rate is the percentage of ``0`` entries
     in that competition's result list.

     ``self.competitions`` and ``self.results`` are not assigned here; they
     are presumably set by ``settings_to_class_attributes`` / ``loop`` --
     confirm there.
     """
     #dict mapping competition to .bin
     self.com_to_bin = utils.mapping_competition_to_bin()

     #dict mapping competition to xml file
     self.com_to_xml = utils.mapping_competition_to_xml()

     #set settings to class attributes
     self.settings_to_class_attributes()

     #loop
     self.loop()

     #write error rates; 'with' closes the file even if a write fails
     with open(os.environ['output_path_txt'], "w") as outfile:
         for competition in self.competitions:
             if competition not in self.results:
                 continue

             answers = self.results[competition]
             #a 0 in the result list counts as one mistake
             mistakes = float(answers.count(0))
             error_rate = 100 * (mistakes / len(answers))
             outfile.write("%s\t%s\n" % (competition, error_rate))
コード例 #6
0
ファイル: stats_gs.py プロジェクト: cltl/WSD_error_analysis
#import general modules
import cPickle
import os

#modules in this cwd
import utils

#obtain exp settings from os.environ and load dict
competition = os.environ['competition']
output_path = os.environ['output_path']
com_to_bin = utils.mapping_competition_to_bin()
path_bin = com_to_bin[competition]

#NOTE(review): unpickling can execute arbitrary code, so path_bin must only
#point at trusted files. The handle is now closed deterministically instead
#of being left to the garbage collector.
with open(path_bin) as infile:
    com, d = cPickle.load(infile)

#list of (label, value) pairs collected below
output = []

#num instances: every entry of d counts as one instance
num_tokens = len(d)
output.append(("# instances", num_tokens))

#pos: distinct 'pos' values, leaving out 'u', joined by spaces
#(set iteration order is arbitrary, so the order of the tags is not fixed)
pos = set(value['pos'] for value in d.itervalues() if value['pos'] != 'u')
pos = " ".join(pos)
output.append(("POS", pos))

#types: number of distinct 'lemma' values
num_types = len(set(value['lemma'] for value in d.itervalues()))
output.append(("# lemmas", num_types))

#type token ratio, rounded to two decimals
#(raises ZeroDivisionError when d is empty)
type_token_ratio = float(num_types) / float(num_tokens)
type_token_ratio = round(type_token_ratio, 2)