Example #1
    def most_informative_features(self, n=100):
        """
        Return a list of the 'most informative' features used by this
        classifier.  For the purpose of this function, the
        informativeness of a feature C{(fname,fval)} is equal to the
        highest value of P(fname=fval|label), for any label, divided by
        the lowest value of P(fname=fval|label), for any label::

          max[ P(fname=fval|label1) / P(fname=fval|label2) ]
        """
        # The set of (fname, fval) pairs used by this classifier.
        features = set()
        # The max & min probability associated w/ each (fname, fval)
        # pair.  Maps (fname,fval) -> float.
        maxprob = defaultdict(lambda: 0.0)
        minprob = defaultdict(lambda: 1.0)

        for (label, fname), probdist in list(self._feature_probdist.items()):
            for fval in probdist.samples():
                feature = (fname, fval)
                features.add( feature )
                p = probdist.prob(fval)
                maxprob[feature] = max(p, maxprob[feature])
                minprob[feature] = min(p, minprob[feature])
                if minprob[feature] == 0:
                    features.discard(feature)

        # Convert features to a list, & sort it by how informative
        # features are.
        features = sorted(features, 
            key=lambda feature: minprob[feature]/maxprob[feature])
        return features[:n]
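A minimal usage sketch (the toy featuresets here are hypothetical; assumes classic NLTK's NaiveBayesClassifier.train):

from nltk.classify import NaiveBayesClassifier

# Hypothetical toy training data: featuresets labeled 'yes'/'no'.
train = [({'outlook': 'sunny', 'windy': False}, 'yes'),
         ({'outlook': 'rainy', 'windy': True}, 'no'),
         ({'outlook': 'sunny', 'windy': True}, 'yes')]
classifier = NaiveBayesClassifier.train(train)

# Pairs come back ordered from most to least informative.
for fname, fval in classifier.most_informative_features(5):
    print(fname, fval)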
Example #2
 def __init__(self):
   self.feature_weights = defaultdict(lambda: ('default',0))
   self.trained = False
   self.largest = defaultdict(lambda: ('default name', 'default value',0))
   self.stopset = set(stopwords.words('english'))
   self.my_feats = self.bigram_word_feats
   self.train(self.my_feats)
   #self.book_train(self.my_feats)
   self.calculate_weights()
   self.train(self.my_feats)
Example #3
 def __init__(self):
     self.feature_weights = defaultdict(lambda: ('default', 0))
     self.trained = False
     self.largest = defaultdict(lambda:
                                ('default name', 'default value', 0))
     self.stopset = set(stopwords.words('english'))
     self.my_feats = self.bigram_word_feats
     self.train(self.my_feats)
     #self.book_train(self.my_feats)
     self.calculate_weights()
     self.train(self.my_feats)
Example #4
  def train(self, feats):
    print "Starting to train the data"
    start = datetime.datetime.now()

    print "setting the ids", datetime.datetime.now()
    self.negids = movie_reviews.fileids('neg')
    self.posids = movie_reviews.fileids('pos')
    #random.shuffle(self.negids)
    #random.shuffle(self.posids)
    ##self.reviews = ([(movie_reviews.words(fileids=[f]), 'neg') for f in self.negids] +
        ##[(movie_reviews.words(fileids=[f]), 'pos') for f in self.posids])
    ##random.shuffle(self.reviews)

    ##self.train_set = apply_features(feats, self.reviews[len(self.reviews)*1/4:])
    ##self.test_set = apply_features(feats, self.reviews[:len(self.reviews)*1/4])

    print "setting the feats", datetime.datetime.now()
    self.negfeats = [(feats(movie_reviews.words(fileids=[f])), 'neg') for f in self.negids]
    self.posfeats = [(feats(movie_reviews.words(fileids=[f])), 'pos') for f in self.posids]

    self.negcutoff = len(self.negfeats) * 3 // 4
    self.poscutoff = len(self.posfeats) * 3 // 4

    print "setting the train/test", datetime.datetime.now()
    self.trainfeats = self.negfeats[:self.negcutoff] + self.posfeats[:self.poscutoff]
    self.testfeats = self.negfeats[self.negcutoff:] + self.posfeats[self.poscutoff:]

    print "training", datetime.datetime.now()
    self.classifier = NaiveBayesClassifier.train(self.trainfeats)
    ##self.classifier = NaiveBayesClassifier.train(self.train_set)
    self.refsets = defaultdict(set)
    self.testsets = defaultdict(set)

    print "accuracy stuff", datetime.datetime.now()
    for i, (feats, label) in enumerate(self.testfeats):
    ##for i, (feats, label) in enumerate(self.test_set):
      self.refsets[label].add(i)
      observed = self.classifier.classify(feats)
      self.testsets[observed].add(i)

    end = datetime.datetime.now()
    print "Training lasted for ", end-start


    print('accuracy:', nltk.classify.util.accuracy(self.classifier, self.testfeats))
    ##print('accuracy:', nltk.classify.util.accuracy(self.classifier, self.test_set))
    print('pos precision:', nltk.metrics.precision(self.refsets['pos'], self.testsets['pos']))
    print('pos recall:', nltk.metrics.recall(self.refsets['pos'], self.testsets['pos']))
    print('neg precision:', nltk.metrics.precision(self.refsets['neg'], self.testsets['neg']))
    print('neg recall:', nltk.metrics.recall(self.refsets['neg'], self.testsets['neg']))
    self.classifier.show_most_informative_features()
    self.trained = True
Example #5
    def __init__(self, tokens, key=lambda x:x):
        """
        Construct a new concordance index.

        @param tokens: The document (list of tokens) that this
            concordance index was created from.  This list can be used
            to access the context of a given word occurrence.
        @param key: A function that maps each token to a normalized
            version that will be used as a key in the index.  E.g., if
            you use C{key=lambda s:s.lower()}, then the index will be
            case-insensitive.
        """
        self._tokens = tokens
        """The document (list of tokens) that this concordance index
           was created from."""
        
        self._key = key
        """Function mapping each token to an index key (or None)."""
        
        self._offsets = defaultdict(list)
        """Dictionary mapping words (or keys) to lists of offset
           indices."""
        
        # Initialize the index (self._offsets)
        for index, word in enumerate(tokens):
            word = self._key(word)
            self._offsets[word].append(index)
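The heart of this constructor is the defaultdict(list) inverted index; the same pattern as a standalone sketch:

from collections import defaultdict

tokens = ['The', 'cat', 'sat', 'on', 'the', 'mat']
offsets = defaultdict(list)
for index, word in enumerate(tokens):
    offsets[word.lower()].append(index)  # key() normalization; here, lowercasing

print(offsets['the'])  # [0, 4]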
Example #6
    def train(labeled_featuresets, entropy_cutoff=0.05, depth_cutoff=100,
              support_cutoff=10, binary=False, feature_values=None,
              verbose=False):
        """
        @param binary: If true, then treat all feature/value pairs as
        individual binary features, rather than using a single n-way
        branch for each feature.
        """
        # Collect a list of all feature names.
        feature_names = set()
        for featureset, label in labeled_featuresets:
            for fname in featureset:
                feature_names.add(fname)

        # Collect a list of the values each feature can take.
        if feature_values is None and binary:
            feature_values = defaultdict(set)
            for featureset, label in labeled_featuresets:
                for fname, fval in featureset.items():
                    feature_values[fname].add(fval)

        # Start with a stump.
        if not binary:
            tree = DecisionTreeClassifier.best_stump(
                feature_names, labeled_featuresets, verbose)
        else:
            tree = DecisionTreeClassifier.best_binary_stump(
                feature_names, labeled_featuresets, feature_values, verbose)

        # Refine the stump.
        tree.refine(labeled_featuresets, entropy_cutoff, depth_cutoff-1,
                    support_cutoff, binary, feature_values, verbose)

        # Return it
        return tree
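The feature_values pass is a plain defaultdict(set) accumulation; sketched standalone with hypothetical featuresets:

from collections import defaultdict

labeled_featuresets = [({'color': 'red', 'size': 'big'}, 'A'),
                       ({'color': 'blue', 'size': 'big'}, 'B')]
feature_values = defaultdict(set)
for featureset, label in labeled_featuresets:
    for fname, fval in featureset.items():
        feature_values[fname].add(fval)

print(feature_values['color'])  # {'red', 'blue'} (set order may vary)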
Example #7
def mk_reldicts(pairs, window=5, trace=0):
    """
    Converts the pairs generated by L{mk_pairs} into a 'reldict': a dictionary which
    stores information about the subject and object NEs plus the filler between them.
    Additionally, a left and right context of length <= window are captured (within
    a given input sentence).

    @param pairs: a pair of list(str) and L{Tree}, as generated by L{mk_pairs}
    @param window: a threshold for the number of items to include in the left and right context
    @type window: C{int}
    @return: 'relation' dictionaries whose keys are 'lcon', 'subjclass', 'subjtext', 'subjsym', 'filler', 'objclass', 'objtext', 'objsym' and 'rcon'
    @rtype: C{list} of C{defaultdict}
    """
    result = []
    while len(pairs) > 2:
        reldict = defaultdict(str)
        reldict['lcon'] = _join(pairs[0][0][-window:])
        reldict['subjclass'] = pairs[0][1].node
        reldict['subjtext'] = _join(pairs[0][1].leaves())
        reldict['subjsym'] = list2sym(pairs[0][1].leaves())
        reldict['filler'] = _join(pairs[1][0])
        reldict['objclass'] = pairs[1][1].node
        reldict['objtext'] = _join(pairs[1][1].leaves())
        reldict['objsym'] = list2sym(pairs[1][1].leaves())
        reldict['rcon'] = _join(pairs[2][0][:window])
        if trace:
            print "(rel(%s, %s)" % (reldict['subjclass'], reldict['objclass'])
        result.append(reldict)
        pairs = pairs[1:]
    return result
Example #8
    def _attempt_proof(self, clauses):
        #map indices to lists of indices, to store attempted unifications
        tried = defaultdict(list)

        i = 0
        while i < len(clauses):
            if not clauses[i].is_tautology():
                #since we try clauses in order, we should start after the last
                #index tried
                if tried[i]:
                    j = tried[i][-1] + 1
                else:
                    j = i + 1  #nothing tried yet for 'i', so start with the next

                while j < len(clauses):
                    #don't: 1) unify a clause with itself,
                    #       2) use tautologies
                    if i != j and j and not clauses[j].is_tautology():
                        tried[i].append(j)
                        newclauses = clauses[i].unify(clauses[j])
                        if newclauses:
                            for newclause in newclauses:
                                newclause._parents = (i + 1, j + 1)
                                clauses.append(newclause)
                                if not len(newclause):  # if there's an empty clause
                                    return (True, clauses)
                            i = -1  #since we added a new clause, restart from the top
                            break
                    j += 1
            i += 1
        return (False, clauses)
Example #9
def invert_dict(d):
    from collections import defaultdict
    inverted_dict = defaultdict(list)
    for key in d:
        for term in d[key]:
            inverted_dict[term].append(key)
    return inverted_dict
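Usage sketch for this variant, which assumes every value in d is iterable:

d = {'doc1': ['cat', 'dog'], 'doc2': ['dog']}
print(dict(invert_dict(d)))  # {'cat': ['doc1'], 'dog': ['doc1', 'doc2']}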
Example #10
    def train(labeled_featuresets, entropy_cutoff=0.05, depth_cutoff=100,
              support_cutoff=10, binary=False, feature_values=None,
              verbose=False):
        """
        @param binary: If true, then treat all feature/value pairs as
        individual binary features, rather than using a single n-way
        branch for each feature.
        """
        # Collect a list of all feature names.
        feature_names = set()
        for featureset, label in labeled_featuresets:
            for fname in featureset:
                feature_names.add(fname)

        # Collect a list of the values each feature can take.
        if feature_values is None and binary:
            feature_values = defaultdict(set)
            for featureset, label in labeled_featuresets:
                for fname, fval in list(featureset.items()):
                    feature_values[fname].add(fval)

        # Start with a stump.
        if not binary:
            tree = DecisionTreeClassifier.best_stump(
                feature_names, labeled_featuresets, verbose)
        else:
            tree = DecisionTreeClassifier.best_binary_stump(
                feature_names, labeled_featuresets, feature_values, verbose)

        # Refine the stump.
        tree.refine(labeled_featuresets, entropy_cutoff, depth_cutoff-1,
                    support_cutoff, binary, feature_values, verbose)

        # Return it
        return tree
Example #11
def mk_reldicts(pairs, window=5, trace=0):
    """
    Converts the pairs generated by L{mk_pairs} into a 'reldict': a dictionary which
    stores information about the subject and object NEs plus the filler between them.
    Additionally, a left and right context of length <= window are captured (within
    a given input sentence).

    @param pairs: a pair of list(str) and L{Tree}, as generated by L{mk_pairs}
    @param window: a threshold for the number of items to include in the left and right context
    @type window: C{int}
    @return: 'relation' dictionaries whose keys are 'lcon', 'subjclass', 'subjtext', 'subjsym', 'filler', 'objclass', 'objtext', 'objsym' and 'rcon'
    @rtype: C{list} of C{defaultdict}
    """
    result = []
    while len(pairs) > 2:
        reldict = defaultdict(str)
        reldict['lcon'] = _join(pairs[0][0][-window:])
        reldict['subjclass'] = pairs[0][1].node
        reldict['subjtext'] = _join(pairs[0][1].leaves())
        reldict['subjsym'] = list2sym(pairs[0][1].leaves())
        reldict['filler'] = _join(pairs[1][0])
        reldict['objclass'] = pairs[1][1].node
        reldict['objtext'] = _join(pairs[1][1].leaves())
        reldict['objsym'] = list2sym(pairs[1][1].leaves())
        reldict['rcon'] = _join(pairs[2][0][:window])
        if trace:
            print "(rel(%s, %s)" % (reldict['subjclass'], reldict['objclass'])
        result.append(reldict)
        pairs = pairs[1:]
    return result
Example #12
    def __init__(self, tokens, key=lambda x: x):
        """
        Construct a new concordance index.

        @param tokens: The document (list of tokens) that this
            concordance index was created from.  This list can be used
            to access the context of a given word occurrence.
        @param key: A function that maps each token to a normalized
            version that will be used as a key in the index.  E.g., if
            you use C{key=lambda s:s.lower()}, then the index will be
            case-insensitive.
        """
        self._tokens = tokens
        """The document (list of tokens) that this concordance index
           was created from."""

        self._key = key
        """Function mapping each token to an index key (or None)."""

        self._offsets = defaultdict(list)
        """Dictionary mapping words (or keys) to lists of offset
           indices."""

        # Initialize the index (self._offsets)
        for index, word in enumerate(tokens):
            word = self._key(word)
            self._offsets[word].append(index)
Example #13
 def _attempt_proof(self, clauses):
     #map indices to lists of indices, to store attempted unifications
     tried = defaultdict(list)
     
     i = 0
     while i < len(clauses):
         if not clauses[i].is_tautology():
             #since we try clauses in order, we should start after the last
             #index tried
             if tried[i]: 
                 j = tried[i][-1] + 1
             else: 
                 j = i + 1 #nothing tried yet for 'i', so start with the next
                 
             while j < len(clauses):
                 #don't: 1) unify a clause with itself, 
                 #       2) use tautologies
                 if i != j and j and not clauses[j].is_tautology():
                     tried[i].append(j) 
                     newclauses = clauses[i].unify(clauses[j])
                     if newclauses:
                         for newclause in newclauses:
                             newclause._parents = (i+1, j+1)
                             clauses.append(newclause)
                             if not len(newclause): #if there's an empty clause
                                 return (True, clauses) 
                         i=-1 #since we added a new clause, restart from the top 
                         break
                 j += 1
         i += 1
     return (False, clauses)
Example #14
 def similar_words(self, word, n=20):
     scores = defaultdict(int)
     for c in self._word_to_contexts[self._key(word)]:
         for w in self._context_to_words[c]:
             if w != word:
                 print(w, c, self._context_to_words[c][word], self._context_to_words[c][w])
                 scores[w] += self._context_to_words[c][word] * self._context_to_words[c][w]  
     return sorted(scores, key=scores.get, reverse=True)[:n]
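The scoring idiom is defaultdict(int) accumulation followed by a sort on the accumulated scores; standalone, with hypothetical weights:

from collections import defaultdict

cooccurrences = [('cat', 3), ('dog', 1), ('cat', 2)]
scores = defaultdict(int)
for word, weight in cooccurrences:
    scores[word] += weight

print(sorted(scores, key=scores.get, reverse=True))  # ['cat', 'dog']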
Example #15
    def train(labeled_featuresets, estimator=ELEProbDist):
        """
        @param labeled_featuresets: A list of classified featuresets,
            i.e., a list of tuples C{(featureset, label)}.
        """
        label_freqdist = FreqDist()
        feature_freqdist = defaultdict(FreqDist)
        feature_values = defaultdict(set)
        fnames = set()

        # Count up how many times each feature value occurred, given
        # the label and featurename.
        for featureset, label in labeled_featuresets:
            label_freqdist.inc(label)
            for fname, fval in list(featureset.items()):
                # Increment freq(fval|label, fname)
                feature_freqdist[label, fname].inc(fval)
                # Record that fname can take the value fval.
                feature_values[fname].add(fval)
                # Keep a list of all feature names.
                fnames.add(fname)

        # If a feature didn't have a value given for an instance, then
        # we assume that it gets the implicit value 'None.'  This loop
        # counts up the number of 'missing' feature values for each
        # (label,fname) pair, and increments the count of the fval
        # 'None' by that amount.
        for label in label_freqdist:
            num_samples = label_freqdist[label]
            for fname in fnames:
                count = feature_freqdist[label, fname].N()
                feature_freqdist[label, fname].inc(None, num_samples-count)
                feature_values[fname].add(None)

        # Create the P(label) distribution
        label_probdist = estimator(label_freqdist)

        # Create the P(fval|label, fname) distribution
        feature_probdist = {}
        for ((label, fname), freqdist) in list(feature_freqdist.items()):
            probdist = estimator(freqdist, bins=len(feature_values[fname]))
            feature_probdist[label,fname] = probdist

        return NaiveBayesClassifier(label_probdist, feature_probdist)
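The subtle step is the 'missing value' loop: each (label, fname) distribution is padded with None counts so it sums to the number of training instances for that label. The same arithmetic with collections.Counter standing in for NLTK's old FreqDist API:

from collections import Counter

num_samples = 3             # training instances seen with this label
freq = Counter({'red': 2})  # observed values of one feature, given the label
freq[None] += num_samples - sum(freq.values())
print(freq)                 # Counter({'red': 2, None: 1})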
Example #16
 def similar_words(self, word, n=20):
     scores = defaultdict(int)
     for c in self._word_to_contexts[self._key(word)]:
         for w in self._context_to_words[c]:
             if w != word:
                 print(w, c, self._context_to_words[c][word],
                       self._context_to_words[c][w])
                 scores[w] += (self._context_to_words[c][word]
                               * self._context_to_words[c][w])
     return sorted(scores, key=scores.get, reverse=True)[:n]
Example #17
def invert_dict(d):
    from collections import defaultdict
    inverted_dict = defaultdict(list)
    for key in d:
        if hasattr(d[key], '__iter__'):
            for term in d[key]:
                inverted_dict[term].append(key)
        else:
            inverted_dict[d[key]] = key
    return inverted_dict
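This variant also accepts non-iterable values, mapping them back to a bare key (note that on Python 3, strings have __iter__ and so are inverted character by character):

d = {'a': [1, 2], 'b': 3}
print(dict(invert_dict(d)))  # {1: ['a'], 2: ['a'], 3: 'b'}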
Example #18
def invert_dict(d):
    from collections import defaultdict
    inverted_dict = defaultdict(list)
    for key in d:
        if hasattr(d[key], '__iter__'):
            for term in d[key]:
                inverted_dict[term].append(key)
        else:
            inverted_dict[d[key]] = key
    return inverted_dict
Example #19
 def _make_predicate_dict(self, assumptions):
     """
     Create a dictionary of predicates from the assumptions.
     
     @param assumptions: a C{list} of C{Expression}s
     @return: C{dict} mapping C{AbstractVariableExpression} to C{PredHolder}
     """
     predicates = defaultdict(PredHolder)
     for a in assumptions:
         self._map_predicates(a, predicates)
     return predicates
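defaultdict accepts any zero-argument factory, including a user-defined class such as PredHolder; a standalone sketch with a hypothetical stand-in class:

from collections import defaultdict

class Holder:  # hypothetical stand-in for PredHolder
    def __init__(self):
        self.entries = []

predicates = defaultdict(Holder)
predicates['walk'].entries.append('walk(john)')  # first access creates a Holder
print(predicates['walk'].entries)                # ['walk(john)']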
Example #20
def analyze_entity_judgments(site):
    ''' Returns a mapping { entity ID -> { candidate link -> 
    (num turkers judged candidate relevant, num turkers judged it irrelevant) }} '''
    judgments = {} 
    
    # a mapping of turker id -> {candidate title -> true/false judgment} 
    # for each candidate annotated by the turker
    annotator_decisions = defaultdict(list)
    
    row_num = 0
    rows_plus_headers = csv_util.query_csv_for_rows(__entities_results_csv_path__, False)
    for row in rows_plus_headers:
        try:
            if row_num==0: # row 0 is header
                entity_id_col = row.index('Input.entity_id')
                candidate_link_col = row.index('Input.candidate_link') 
                
                turkerID_col = row.index('WorkerId')
                answer_col = row.index('Answer.Q1')
            else:
                judged_entity_id = row[entity_id_col]
                
                if judged_entity_id in judgments:
                    selected_candidates = judgments[judged_entity_id]
                else:
                    selected_candidates = {}
                    
                selected_candidate_title = wikipedia_api_util.get_page_title_from_url(row[candidate_link_col])
                if selected_candidate_title in selected_candidates:
                    (num_true, num_false) = selected_candidates[selected_candidate_title]
                else:
                    (num_true, num_false) = (0,0)
                    
                judgment = row[answer_col]
                if judgment=='true':
                    num_true = num_true+1
                else:
                    num_false = num_false+1
                selected_candidates[selected_candidate_title] = (num_true, num_false)
                judgments[judged_entity_id] = selected_candidates
                
                turkerID = row[turkerID_col]
                annotator_decisions[turkerID].append({selected_candidate_title:judgment})
                    
            row_num = row_num+1    
        except:
            continue # just ignore a problematic row   
        
    # Cache each annotator's decisions for later inter-rater agreement calculations
    entity_dataset_mgr.save_annotator_decisions(annotator_decisions, site)   
        
    print "Cached a total of "+str(len(judgments))+" entities judged by human Mechanical Turk annotators"
    entity_dataset_mgr.save_entity_judgements(judgments, site)
    return judgments
Example #21
    def stump(feature_name, labeled_featuresets):
        label = FreqDist([label for (featureset, label) in labeled_featuresets]).max()

        # Find the best label for each value.
        freqs = defaultdict(FreqDist)  # freq(label|value)
        for featureset, label in labeled_featuresets:
            feature_value = featureset[feature_name]
            freqs[feature_value].inc(label)

        decisions = dict([(val, DecisionTreeClassifier(freqs[val].max())) for val in freqs])
        return DecisionTreeClassifier(label, feature_name, decisions)
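freqs maps each observed feature value to a frequency distribution over labels, and the stump predicts each value's majority label; the same bookkeeping with collections.Counter (FreqDist.inc/.max are old NLTK API):

from collections import Counter, defaultdict

labeled = [({'outlook': 'sunny'}, 'yes'),
           ({'outlook': 'sunny'}, 'yes'),
           ({'outlook': 'rainy'}, 'no')]
freqs = defaultdict(Counter)  # feature value -> label counts
for featureset, label in labeled:
    freqs[featureset['outlook']][label] += 1

decisions = {val: counts.most_common(1)[0][0] for val, counts in freqs.items()}
print(decisions)  # {'sunny': 'yes', 'rainy': 'no'}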
Example #22
    def stump(feature_name, labeled_featuresets):
        label = FreqDist(
            [label for (featureset, label) in labeled_featuresets]).max()

        # Find the best label for each value.
        freqs = defaultdict(FreqDist)  # freq(label|value)
        for featureset, label in labeled_featuresets:
            feature_value = featureset.get(feature_name)
            freqs[feature_value].inc(label)

        decisions = dict([(val, DecisionTreeClassifier(freqs[val].max()))
                          for val in freqs])
        return DecisionTreeClassifier(label, feature_name, decisions)
Example #23
    def _init(self):
        self._f2c = defaultdict(set)
        self._c2f = defaultdict(set)

        if self._pattern is not None:
            for file_id in self._fileids:
                category = re.match(self._pattern, file_id).group(1)
                self._add(file_id, category)

        elif self._map is not None:
            for (file_id, categories) in self._map.items():
                for category in categories:
                    self._add(file_id, category)

        elif self._file is not None:
            for line in self.open(self._file).readlines():
                line = line.strip()
                file_id, categories = line.split(self._delimiter, 1)
                if file_id not in self.fileids():
                    raise ValueError('In category mapping file %s: %s '
                                     'not found' % (self._file, file_id))
                for category in categories.split(self._delimiter):
                    self._add(file_id, category)
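The paired defaultdict(set) tables build a file-to-categories and category-to-files index in one pass; standalone, with a hypothetical mapping:

from collections import defaultdict

mapping = {'doc1.txt': ['news'], 'doc2.txt': ['news', 'sport']}
f2c, c2f = defaultdict(set), defaultdict(set)
for file_id, categories in mapping.items():
    for category in categories:
        f2c[file_id].add(category)
        c2f[category].add(file_id)

print(sorted(c2f['news']))  # ['doc1.txt', 'doc2.txt']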
Example #24
File: api.py Project: wrand/tweater
    def _init(self):
        self._f2c = defaultdict(set)
        self._c2f = defaultdict(set)

        if self._pattern is not None:
            for file_id in self._fileids:
                category = re.match(self._pattern, file_id).group(1)
                self._add(file_id, category)

        elif self._map is not None:
            for (file_id, categories) in self._map.items():
                for category in categories:
                    self._add(file_id, category)

        elif self._file is not None:
            for line in self.open(self._file).readlines():
                line = line.strip()
                file_id, categories = line.split(self._delimiter, 1)
                if file_id not in self.fileids():
                    raise ValueError('In category mapping file %s: %s '
                                     'not found' % (self._file, file_id))
                for category in categories.split(self._delimiter):
                    self._add(file_id, category)
Example #25
 def createSenseTree(self, senseList):
     '''
     create parse tree for all senses in senseList
     eg: {'conduct': ['institution', 'to', 'business'], 
     'ROOT': ['created'], 'institution': ['an'], 'created': ['institution', 'conduct']}
     '''
     senseDict = []
     depParsed = parseSenses(senseList)
     for dep in depParsed:
         temp = defaultdict(list)
         for n, v in dep:
             n = stemWords(n)
             v = stemWords(v)
             temp[n].append(v)
         senseDict.append(temp)
     return senseDict
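Each dependency parse is flattened into head-to-dependents lists via defaultdict(list); the grouping step standalone (stemming omitted, pairs hypothetical):

from collections import defaultdict

dep = [('created', 'institution'), ('created', 'conduct'), ('institution', 'an')]
temp = defaultdict(list)
for n, v in dep:
    temp[n].append(v)

print(temp['created'])  # ['institution', 'conduct']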
Example #26
def page_from_reference(href):
    '''
    Returns a tuple of the HTML page built and the new current word

    @param href: The hypertext reference to be solved
    @type href: str
    @return: A tuple (page,word), where page is the new current HTML page
             to be sent to the browser and
             word is the new current word
    @rtype: A tuple (str,str)
    '''
    word = href.word
    pos_forms = defaultdict(list)
    words = word.split(',')
    words = [
        w for w in [w.strip().lower().replace(' ', '_') for w in words]
        if w != ""
    ]
    if len(words) == 0:
        # No words were found.
        return "", "Please specify a word to search for."

    # This looks up multiple words at once.  This is probably not
    # necessary and may lead to problems.
    for w in words:
        for pos in [wn.NOUN, wn.VERB, wn.ADJ, wn.ADV]:
            form = wn.morphy(w, pos)
            if form and form not in pos_forms[pos]:
                pos_forms[pos].append(form)
    body = ''
    for pos, pos_str, name in _pos_tuples():
        if pos in pos_forms:
            body += _hlev(3, name) + '\n'
            for w in pos_forms[pos]:
                # Not all words of exc files are in the database, skip
                # to the next word if a KeyError is raised.
                try:
                    body += _collect_all_synsets(w, pos, href.synset_relations)
                except KeyError:
                    pass
    if not body:
        body = "The word or words '%s' where not found in the dictonary." % word
    return body, word
Example #27
def page_from_reference(href):
    '''
    Returns a tuple of the HTML page built and the new current word

    @param href: The hypertext reference to be solved
    @type href: str
    @return: A tuple (page,word), where page is the new current HTML page
             to be sent to the browser and
             word is the new current word
    @rtype: A tuple (str,str)
    '''
    word = href.word
    pos_forms = defaultdict(list)
    words = word.split(',')
    words = [w for w in [w.strip().lower().replace(' ', '_') 
                         for w in words]
             if w != ""]
    if len(words) == 0:
        # No words were found.
        return "", "Please specify a word to search for."
    
    # This looks up multiple words at once.  This is probably not
    # necessary and may lead to problems.
    for w in words:
        for pos in [wn.NOUN, wn.VERB, wn.ADJ, wn.ADV]:
            form = wn.morphy(w, pos)
            if form and form not in pos_forms[pos]:
                pos_forms[pos].append(form)
    body = ''
    for pos,pos_str,name in _pos_tuples():
        if pos in pos_forms:
            body += _hlev(3, name) + '\n'
            for w in pos_forms[pos]:
                # Not all words of exc files are in the database, skip
                # to the next word if a KeyError is raised.
                try:
                    body += _collect_all_synsets(w, pos, href.synset_relations)
                except KeyError:
                    pass
    if not body:
        body = "The word or words '%s' where not found in the dictonary." % word
    return body, word
Example #28
 def dictOfDicts():
     return defaultdict(dictOfDicts)
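Because the factory refers to itself, this yields an arbitrarily deep auto-vivifying dictionary:

d = dictOfDicts()
d['a']['b']['c'] = 1     # intermediate levels are created on first access
print(d['a']['b']['c'])  # 1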
Example #29
def get_resolved_ambiguous_entities():
    """ Returns the ambiguous entities for which the intended 
    meaning has been unanimously resolved by human annotators. """

    all_entities = defaultdict(list)
    correct_meaning_label = "Y"

    row_count = -1
    labeled_entities_dataset = csv_util.query_csv_for_rows("labeled_data/entities.csv", False)
    for candidate_row in labeled_entities_dataset:
        row_count = row_count + 1
        if row_count == 0:
            # header row
            surfaceform_col = candidate_row.index("surface_form")
            shorttext_col = candidate_row.index("short_text")

            candidate_meaning_col = candidate_row.index("candidate_meaning")
            candidate_label_col = candidate_row.index("candidate_is_relevant")

            userkey_col = candidate_row.index("user_key")
            continue

        # use "surfaceform_shorttext" as ID for entity
        surfaceform = candidate_row[surfaceform_col]
        shorttext = candidate_row[shorttext_col]
        entity_id = surfaceform + "_" + shorttext

        meaning = candidate_row[candidate_meaning_col]
        label = candidate_row[candidate_label_col]
        userkey = candidate_row[userkey_col]
        all_entities[entity_id].append((meaning, label, surfaceform, shorttext, userkey))

    # test if entity is ambiguous (i.e., has more than one candidate meaning) and
    # if so if entity has been resolved (i.e., has at least one candidate labeled
    # as the intended meaning)
    resolved_entities = {}
    for entity in all_entities:
        entity_tuple_list = all_entities[entity]
        if len(entity_tuple_list) < 2:
            continue

        candidate_meanings = []
        intended_meanings = []
        user = None
        for (meaning, label, surfaceform, shorttext, userkey) in entity_tuple_list:

            # title of a potential meaning of the ambiguous entity
            if not meaning in candidate_meanings:
                candidate_meanings.append(meaning)

            # annotated label indicating whether this candidate
            # meaning is the intended meaning of the entity
            if label == correct_meaning_label and not meaning in intended_meanings:
                intended_meanings.append(meaning)

            if user is None:
                user = userkey
        if len(candidate_meanings) > 1 and len(intended_meanings) > 0 and user is not None:
            # this entity is ambiguous, has been manually resolved,
            # and we know the user who wrote it
            entity_obj = ResolvedEntity(candidate_meanings, intended_meanings, surfaceform, shorttext, user)
            entity_id = entity_obj.get_id()
            resolved_entities[entity_id] = entity_obj
    return resolved_entities
Example #30
def get_resolved_ambiguous_entities():
    ''' Returns the ambiguous entities for which the intended 
    meaning has been unanimously resolved by human annotators. '''
    
    all_entities = defaultdict(list)
    correct_meaning_label = 'Y'
    
    row_count = -1
    labeled_entities_dataset = csv_util.query_csv_for_rows('labeled_data/entities.csv', False)
    for candidate_row in labeled_entities_dataset:
        row_count = row_count+1
        if row_count==0:
            # header row
            surfaceform_col = candidate_row.index('surface_form')
            shorttext_col = candidate_row.index('short_text')
            
            candidate_meaning_col = candidate_row.index('candidate_meaning')
            candidate_label_col = candidate_row.index('candidate_is_relevant')
            
            userkey_col = candidate_row.index('user_key')
            continue
        
        # use "surfaceform_shorttext" as ID for entity
        surfaceform = candidate_row[surfaceform_col]
        shorttext = candidate_row[shorttext_col]
        entity_id = surfaceform+'_'+shorttext
        
        meaning = candidate_row[candidate_meaning_col]
        label = candidate_row[candidate_label_col] 
        userkey = candidate_row[userkey_col]
        all_entities[entity_id].append((meaning, label, surfaceform, shorttext, userkey))
        
    # test if entity is ambiguous (i.e., has more than one candidate meaning) and
    # if so if entity has been resolved (i.e., has at least one candidate labeled
    # as the intended meaning)
    resolved_entities = {}
    for entity in all_entities:
        entity_tuple_list = all_entities[entity]
        if len(entity_tuple_list) < 2:
            continue 
        
        candidate_meanings = []
        intended_meanings = []
        user = None
        for (meaning, label, surfaceform, shorttext, userkey) in entity_tuple_list:
            
            # title of a potential meaning of the ambiguous entity
            if not meaning in candidate_meanings:
                candidate_meanings.append(meaning)
            
            # annotated label indicating whether this candidate 
            # meaning is the intended meaning of the entity
            if label==correct_meaning_label and not meaning in intended_meanings:
                intended_meanings.append(meaning)
            
            if user is None:
                user = userkey
        if len(candidate_meanings)>1 and len(intended_meanings)>0 and user is not None:
            # this entity is ambiguous, has been manually resolved, 
            # and we know the user who wrote it
            entity_obj = ResolvedEntity(candidate_meanings, intended_meanings, surfaceform, shorttext, user)
            entity_id = entity_obj.get_id()
            resolved_entities[entity_id] = entity_obj
    return resolved_entities
Example #31
 def dictOfDicts():
     return defaultdict(dictOfDicts)
Example #32
def make_tweet_entities_csv_for_turk():
    twitter_site = short_text_websites.get_twitter_site()
    entities_to_evaluate = entity_dataset_mgr.get_valid_ne_candidates(twitter_site)
    if entities_to_evaluate is None:
        print "No ambiguous entities + candidates in cache. Run run_all_dataset_generators "+\
        "script and choose to first fetch and store more entities from short texts."
        return
    
    judged_row_plus_headers = csv_util.query_csv_for_rows(__entities_results_csv_path__, False)
    judged_row_num = 0
    already_judged = [] # list of (entity id, candidate link)
    for judge_row in judged_row_plus_headers:
        try:
            if judged_row_num==0: # row 0 is header
                entity_id_col = judge_row.index('Input.entity_id')
                candidate_link_col = judge_row.index('Input.candidate_link') 
            else:
                judged_tuple = (judge_row[entity_id_col], judge_row[candidate_link_col])
                if not judged_tuple in already_judged:
                    already_judged.append(judged_tuple)
            judged_row_num = judged_row_num+1    
        except:
            continue # just ignore a problematic row      
        
    # Determine what entity+candidate tasks we actually want to write to a spreadsheet 
    # and send to mturk since we don't have resources for unlimited mturk tasks
    tasks = {} # NamedEntity object -> candidate judgment tasks we actually want performed
    user_entities = defaultdict(list) # username -> [NamedEntity obj]
    done_shorttexts = [] # list of shorttext id
    random.shuffle(entities_to_evaluate) # so we get a random subset of a user's entities
    for ne_obj in entities_to_evaluate:
        
        # "40 nouns usually enough to establish statistically significant 
        # differences between WSD algorithms" (Santamaria et al., 2010)
        username = ne_obj.username
        if len(user_entities[username]) > 50:
            continue # have enough entities for this user
        
        # limiting our dataset to one named entity per short text
        shorttext_id = ne_obj.shorttext_id
        if shorttext_id in done_shorttexts:
            continue
        
        # no need to create tasks for candidates we already have annotator judgments for
        entity_id = ne_obj.get_entity_id()
        candidate_URLs = ne_obj.get_candidate_wikiURLs()
        valid_candidate_tasks = []
        for candidate_URL in candidate_URLs:
            if ((entity_id, candidate_URL) in already_judged):
                continue
            valid_candidate_tasks.append(candidate_URL)
        if len(valid_candidate_tasks)==0:
            continue # already have annotator judgments for all of this entity's candidates
        if len(candidate_URLs)+len(valid_candidate_tasks) < 2:
            # this would be a non-ambiguous entity, and we should never reach this
            # point because such entities should have been filtered out by now
            raise ValueError("Non-ambiguous entity should have been filtered out")
        tasks[entity_id] = valid_candidate_tasks
        user_entities[username].append(ne_obj)
        done_shorttexts.append(shorttext_id)
            
    # put valid entities + candidates in the spreadsheet until reach our limit of tasks
    task_max = 1400    
    
    rows = []
    headers = ['entity_id', 'short_text', 'ambiguous_entity', 'candidate_link']
    rows.append(headers)
    
    for username in user_entities:
        
        # add users until reach our limit on the number of tasks we can afford, 
        # but break at this point in the loop rather than in the inner loop to
        # ensure that we do have at least 50 entities per user (even if this
        # means we go over our task limit a little in order to reach that amount)
        if len(rows) > task_max:
            break
        
        # bypass users who haven't written the minimum number of valid entities
        # required to establish statistical significance between the algorithms
        if len(user_entities[username]) < 50:
            continue
        
        # should be 50 NamedEntity objects per user, and we'll make tasks for their candidates
        for ne_obj in user_entities[username]:
            entity_id = ne_obj.get_entity_id()
        
            # make sure the entity presented to a Turker looks the same as
            # it appears in the short text (ie with the same capitalization)
            original_shorttext = ne_obj.shorttext_str.decode('latin-1')
            surface_form = ne_obj.surface_form
            if not surface_form in original_shorttext:
                surface_form = __match_appearance__(surface_form, original_shorttext)
            
            # shuffle candidates so that they don't appear
            # in wikiminer's/dbpedia's ranking order and bias the turker
            candidate_URLs = tasks[entity_id]
            random.shuffle(candidate_URLs)
            choices = candidate_URLs[:] # copy (list slicing)
            for choice in choices:
                # make a separate row for each candidate link 
                # rather than putting all links in a single cell
                row = [entity_id, original_shorttext, surface_form, choice]
                rows.append(row)
            
            if len(rows)%50==0:
                # write the rows every once in a while in case we reach an error
                print "Updating spreadsheet..."+str(len(rows))
                csv_util.write_to_spreadsheet(__entities_to_judge_csv_path__, rows)
        
    # dump to csv
    csv_util.write_to_spreadsheet(__entities_to_judge_csv_path__, rows)
Example #33
    def train(self, feats):
        print "Starting to train the data"
        start = datetime.datetime.now()

        print "setting the ids", datetime.datetime.now()
        self.negids = movie_reviews.fileids('neg')
        self.posids = movie_reviews.fileids('pos')
        #random.shuffle(self.negids)
        #random.shuffle(self.posids)
        ##self.reviews = ([(movie_reviews.words(fileids=[f]), 'neg') for f in self.negids] +
        ##[(movie_reviews.words(fileids=[f]), 'pos') for f in self.posids])
        ##random.shuffle(self.reviews)

        ##self.train_set = apply_features(feats, self.reviews[len(self.reviews)*1/4:])
        ##self.test_set = apply_features(feats, self.reviews[:len(self.reviews)*1/4])

        print "setting the feats", datetime.datetime.now()
        self.negfeats = [(feats(movie_reviews.words(fileids=[f])), 'neg')
                         for f in self.negids]
        self.posfeats = [(feats(movie_reviews.words(fileids=[f])), 'pos')
                         for f in self.posids]

        self.negcutoff = len(self.negfeats) * 3 // 4
        self.poscutoff = len(self.posfeats) * 3 // 4

        print "setting the train/test", datetime.datetime.now()
        self.trainfeats = self.negfeats[:self.
                                        negcutoff] + self.posfeats[:self.
                                                                   poscutoff]
        self.testfeats = self.negfeats[self.negcutoff:] + self.posfeats[
            self.poscutoff:]

        print "training", datetime.datetime.now()
        self.classifier = NaiveBayesClassifier.train(self.trainfeats)
        ##self.classifier = NaiveBayesClassifier.train(self.train_set)
        self.refsets = defaultdict(set)
        self.testsets = defaultdict(set)

        print "accuracy stuff", datetime.datetime.now()
        for i, (feats, label) in enumerate(self.testfeats):
            ##for i, (feats, label) in enumerate(self.test_set):
            self.refsets[label].add(i)
            observed = self.classifier.classify(feats)
            self.testsets[observed].add(i)

        end = datetime.datetime.now()
        print "Training lasted for ", end - start

        print('accuracy:', nltk.classify.util.accuracy(self.classifier, self.testfeats))
        ##print('accuracy:', nltk.classify.util.accuracy(self.classifier, self.test_set))
        print('pos precision:', nltk.metrics.precision(self.refsets['pos'], self.testsets['pos']))
        print('pos recall:', nltk.metrics.recall(self.refsets['pos'], self.testsets['pos']))
        print('neg precision:', nltk.metrics.precision(self.refsets['neg'], self.testsets['neg']))
        print('neg recall:', nltk.metrics.recall(self.refsets['neg'], self.testsets['neg']))
        self.classifier.show_most_informative_features()
        self.trained = True
Example #34
def compare_ranking_precision(resolved_entities):

    # the total number of entities we have unanimous annotator judgments for
    total_evaluated = 0
    
    # the total number of entities for which turkers
    # could not agree upon the correct candidate
    annotator_disagreement = 0
    gold_correct = 0
    
    # the total number of entities for which our algorithms and the 
    # baseline ranking techniques selected the correct candidate
    reslve_algs_correct = defaultdict(int) # alg id -> # times correct
    nonmatch_algs_baseline_correct = defaultdict(int)
    wikiminer_correct = 0
    dbpedia_correct = 0
    random_correct = 0
    
    toolkit_failures = 0
    reslve_success_when_toolkits_fail = 0

    for resolved_entity in resolved_entities:
        gold_standard_candidates = resolved_entity.get_unanimous_candidates_goldstandard()
        if len(gold_standard_candidates)==0:
            annotator_disagreement = annotator_disagreement+1
            continue # turkers couldn't agree on this entity
        
        is_wikiminer_correct = resolved_entity.is_baseline_wikiminer_correct()
        is_dbpedia_correct = resolved_entity.is_baseline_dbpedia_correct()
        
        for alg_id in resolved_entity.reslve_rankings.keys():
            
            # check if RESLVE algorithm selected the correct candidate
            is_reslve_algs_correct = resolved_entity.is_reslve_correct(alg_id)
            if is_reslve_algs_correct:
                reslve_algs_correct[alg_id] = reslve_algs_correct[alg_id]+1
                
            # run the same RESLVE algorithm but use a random non-matching user who
            # doesn't provide the user interest model we claim is so relevant and
            # valuable (ie we want to make sure that just incorporating any random 
            # wikipedia data isn't the main reason for any good performance we see) 
            if resolved_entity.is_baseline_reslve_nonmatch_correct(alg_id):
                nonmatch_algs_baseline_correct[alg_id] = nonmatch_algs_baseline_correct[alg_id]+1
        
            # measure whether when toolkits are wrong, RESLVE can perform correctly
            if not is_wikiminer_correct and not is_dbpedia_correct:
                toolkit_failures = toolkit_failures+1
                if is_reslve_algs_correct:
                    reslve_success_when_toolkits_fail = reslve_success_when_toolkits_fail+1                
            
        # check performance of the base line strategies
        if is_wikiminer_correct:
            wikiminer_correct = wikiminer_correct+1
        if is_dbpedia_correct:
            dbpedia_correct = dbpedia_correct+1
        if resolved_entity.is_baseline_random_correct():
            random_correct = random_correct+1
        if resolved_entity.is_goldstandard_correct():
            gold_correct = gold_correct+1  
        
        total_evaluated = total_evaluated+1     
    
    wikiminer_accuracy = float(wikiminer_correct) / float(total_evaluated)
    print("Wikipedia Miner precision: " + str(wikiminer_accuracy))

    dbpedia_accuracy = float(dbpedia_correct) / float(total_evaluated)
    print("DBPedia Spotlight precision: " + str(dbpedia_accuracy))

    random_accuracy = float(random_correct) / float(total_evaluated)
    print("Random baseline precision: " + str(random_accuracy))

    gold_accuracy = float(gold_correct) / float(total_evaluated)
    print("Human annotator ability to reach consensus: " + str(gold_accuracy))
    
    for alg_id in resolved_entity.reslve_rankings.keys():
        reslve_correct = reslve_algs_correct[alg_id]
        reslve_accuracy = float(reslve_correct) / float(total_evaluated)
        print("RESLVE " + alg_id + " precision: " + str(reslve_accuracy))

        nonmatch_baseline_correct = nonmatch_algs_baseline_correct[alg_id]
        nonmatch_baseline_accuracy = float(nonmatch_baseline_correct) / float(total_evaluated)
        print("RESLVE nonmatch baseline using " + alg_id + " precision: " + str(nonmatch_baseline_accuracy))
        
        # improvement achieved by incorporating the user interest model
        if nonmatch_baseline_correct == 0:
            improvement_str = "Infinite (non match baseline failed to correctly resolve any entity)"
        else:
            matching_user_improvement = float(reslve_correct - nonmatch_baseline_correct) / float(nonmatch_baseline_correct)
            improvement_str = str(matching_user_improvement)
        print("Improvement boost by incorporating user interest model into RESLVE's "
              + str(alg_id) + ": " + str(improvement_str))
        
    if toolkit_failures == 0:
        print("Toolkits performed with 100% accuracy..")
    else:
        tough_cases_improvement = float(reslve_success_when_toolkits_fail) / float(toolkit_failures)
        print("RESLVE able to achieve " + str(tough_cases_improvement) +
              " precision in the difficult cases when Wikipedia Miner and DBPedia Spotlight fail completely.")