Code Example #1
File: tagger_base.py Project: cedebrun/gt-nlp-class
def apply_tagger(tagger,outfilename,all_tags=None,trainfile=TRAIN_FILE,testfile=DEV_FILE):
    if all_tags is None:
        all_tags = set()
        
        # this is slow
        for i,(words, tags) in enumerate(preproc.conll_seq_generator(trainfile)):
            for tag in tags:
                all_tags.add(tag)
            
    with open(outfilename,'w') as outfile:
        for words,_ in preproc.conll_seq_generator(testfile):
            pred_tags = tagger(words,all_tags)
            for i,tag in enumerate(pred_tags):
                outfile.write(tag+'\n')
            outfile.write('\n')
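For reference, a minimal usage sketch; the output filename and the trivial baseline tagger below are hypothetical, but the calling convention (a tagger is any callable taking (words, all_tags) and returning one tag per word) matches apply_tagger above:

def noun_baseline(words, all_tags):
    # Hypothetical baseline: tag every token as NOUN.
    return ['NOUN' for _ in words]

apply_tagger(noun_baseline, 'nouns-dev.preds')  # hypothetical output path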
Code Example #2
def get_most_common_word_weights(trainfile):
    """
    Return a set of weights, so that each word is tagged by its most frequent tag in the training file.
    If the word does not appear in the training file, the weights should be set so that the output tag is Noun.
    
    Parameters:
    trainfile: -- training file
    :returns: -- classification weights
    :rtype: -- defaultdict

    """
    weights = defaultdict(float)

    word_counters = defaultdict(lambda: Counter())
    tag_counters = defaultdict(lambda: Counter())

    for words, tags in conll_seq_generator(trainfile):
        for tag, word in zip(tags, words):
            word_counters[word].update([tag])
            tag_counters[tag].update([word])

    for word in word_counters:
        for tag in word_counters[word]:
            weights[(tag, word)] = word_counters[word][tag]

    for tag in tag_counters:
        weights[(tag, OFFSET)] = sum(tag_counters[tag].values())  # total count of this tag

    return weights
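A sketch of how such weights could drive prediction, assuming the linear scoring used throughout these examples: each candidate tag is scored as its (tag, word) weight plus its (tag, OFFSET) bias, and the argmax wins. The helper name predict_tag is hypothetical:

def predict_tag(word, weights, all_tags):
    # Score each tag as emission weight for (tag, word) plus the tag's offset weight.
    scores = {tag: weights[(tag, word)] + weights[(tag, OFFSET)] for tag in all_tags}
    return max(scores, key=scores.get)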
Code Example #3
File: test_basics.py Project: cedebrun/gt-nlp-class
def test_correct_number_of_tags():
    ## Demo
    all_tags = set()
    for i,(words, tags) in enumerate(preproc.conll_seq_generator(TRAIN_FILE,max_insts=100000)):
        for tag in tags:
            all_tags.add(tag)
    eq_(len(all_tags),10)
Code Example #4
File: tagger_base.py Project: cedebrun/gt-nlp-class
def apply_model(model,outfilename,word_to_ix, all_tags=None,trainfile=TRAIN_FILE,testfile=DEV_FILE):
    """
    applies the model on the data and writes the best sequence of tags to the outfile
    """
    if all_tags is None:
        all_tags = set()
        
        # this is slow
        for i,(words, tags) in enumerate(preproc.conll_seq_generator(trainfile)):
            for tag in tags:
                all_tags.add(tag)
            
    with open(outfilename,'w') as outfile:
        for words,_ in preproc.conll_seq_generator(testfile):
            seq_words = bilstm.prepare_sequence(words, word_to_ix)
            pred_tags = model.predict(seq_words)
            for i,tag in enumerate(pred_tags):
                outfile.write(tag+'\n')
            outfile.write('\n')
Code Example #5
def get_tag_word_counts(filename):
    """build a dict of counters, one per tag, counting the words that go with each tag
    """
    all_counters = defaultdict(lambda: Counter())
    for words, tags in conll_seq_generator(filename, max_insts=100000):
        for word, tag in zip(words, tags):
            all_counters[tag][word] += 1
    return all_counters
Code Example #6
def get_tag_word_counts(trainfile):
    """
    Produce a Counter of word occurrences for each tag
    
    Parameters:
    trainfile: -- the filename to be passed as argument to conll_seq_generator
    :returns: -- a default dict of counters, where the keys are tags.
    """
    all_counters = defaultdict(lambda: Counter())
    for words, tags in conll_seq_generator(trainfile):
        for tag, word in zip(tags, words):
            all_counters[tag].update([word])
    return all_counters
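A brief usage sketch; the training file path is hypothetical:

counters = get_tag_word_counts('en-ud-train.conllu')  # hypothetical path
for tag, counter in counters.items():
    # Print the three most frequent words observed with each tag.
    print(tag, counter.most_common(3))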
Code Example #7
File: test_bilstm.py Project: cedebrun/gt-nlp-class
def setup():
    global word_to_ix, tag_to_ix, X_tr, Y_tr, model, embedding_dim
    
    vocab, word_to_ix = most_common.get_word_to_ix(TRAIN_FILE, max_size=6500)
    tag_to_ix={}
    for i,(words,tags) in enumerate(preproc.conll_seq_generator(TRAIN_FILE)):
        for tag in tags:
            if tag not in tag_to_ix:
                tag_to_ix[tag] = len(tag_to_ix)
    
    
    torch.manual_seed(765)
    
    embedding_dim=30
    hidden_dim=30
    model = bilstm.BiLSTM(len(word_to_ix),tag_to_ix,embedding_dim, hidden_dim)
    
    X_tr = []
    Y_tr = []
    for i,(words,tags) in enumerate(preproc.conll_seq_generator(TRAIN_FILE)):
        X_tr.append(words)
        Y_tr.append(tags)
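With this setup in place, a single sentence can be pushed through the (still untrained) model using the same calls that apply_model makes in Code Example #4; a minimal sketch:

seq = bilstm.prepare_sequence(X_tr[0], word_to_ix)  # map words to indices
pred_tags = model.predict(seq)                      # one predicted tag per word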
Code Example #8
File: tagger_base.py Project: cedebrun/gt-nlp-class
def apply_tagger(tagger,outfilename=None,all_tags=None,trainfile=TRAIN_FILE,testfile=DEV_FILE,kaggle=False):
    if all_tags is None:
        all_tags = set()

        # this is slow
        for words, tags in preproc.conll_seq_generator(trainfile):
            for tag in tags:
                all_tags.add(tag)
        
    with open(outfilename,'w') as outfile:
        idx = 1
        if kaggle:
            outfile.write("Id,Prediction")
        for words,_ in preproc.conll_seq_generator(testfile):
            pred_tags = tagger(words,all_tags)
            if kaggle:
                for tag in pred_tags:
                    outfile.write("\n")
                    outfile.write(str(idx) + "," + tag)
                    idx += 1
            else:
                for tag in pred_tags:
                    outfile.write(tag + '\n')
                outfile.write('\n')
Code Example #9
File: most_common.py Project: arbylee1/gt-nlp-class
def get_tag_trans_counts(trainfile):
    """compute a dict of counters for tag transitions

    :param trainfile: name of file containing training data
    :returns: dict, in which keys are tags, and values are counters of succeeding tags
    :rtype: dict

    """
    tot_counts = defaultdict(lambda : Counter())
    for _,tags in conll_seq_generator(trainfile):
        tags = [START_TAG] + tags + [END_TAG]
        tag_trans = zip(tags[:-1],tags[1:])
        for prev_tag, curr_tag in tag_trans:
            tot_counts[prev_tag][curr_tag] += 1

    return dict(tot_counts)
Code Example #10
File: most_common.py Project: cedebrun/gt-nlp-class
def get_tag_trans_counts(trainfile):
    """compute a dict of counters for tag transitions

    :param trainfile: name of file containing training data
    :returns: dict, in which keys are tags, and values are counters of succeeding tags
    :rtype: dict

    """
    tot_counts = defaultdict(lambda : Counter())
    for _,tags in conll_seq_generator(trainfile):
        tags = [START_TAG] + tags + [END_TAG]
        tag_trans = zip(tags[:-1],tags[1:])
        for prev_tag, curr_tag in tag_trans:
            tot_counts[prev_tag][curr_tag] += 1

    return dict(tot_counts)
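The returned dict of raw counts can be normalized into transition probabilities P(curr | prev), e.g. for an HMM; a minimal sketch:

trans_counts = get_tag_trans_counts(TRAIN_FILE)
trans_probs = {}
for prev, counter in trans_counts.items():
    total = sum(counter.values())
    trans_probs[prev] = {curr: n / total for curr, n in counter.items()}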
Code Example #11
def setup():
    global all_tags, theta_clf_hand, toy_data
    all_tags = set()
    for i,(words, tags) in enumerate(preproc.conll_seq_generator(TRAIN_FILE,max_insts=100000)):
        for tag in tags:
            all_tags.add(tag)

    theta_clf_hand = defaultdict(float,
                                 {('NOUN',OFFSET):0.1,
                                  ('PRON',CURR_WORD_FEAT,'They'):1,
                                  ('PRON',CURR_WORD_FEAT,'can'):-1,
                                  ('NOUN',CURR_WORD_FEAT,'fish'):1,
                                  ('VERB',CURR_WORD_FEAT,'fish'):0.5})

    toy_data = [('They can fish'.split(),['PRON','AUX','VERB']),
                ('the old man the boat'.split(),['DET','NOUN','VERB','DET','NOUN'])]
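To see why these hand-set weights behave sensibly on the toy data: for the word 'fish', NOUN scores its current-word weight plus the NOUN offset (1 + 0.1 = 1.1), while VERB scores only 0.5, so 'fish' is tagged NOUN. A sketch of that check, assuming the classifier sums feature weights this way:

def score(tag, word):
    # Sum the current-word feature weight and the tag's offset weight.
    return theta_clf_hand[(tag, CURR_WORD_FEAT, word)] + theta_clf_hand[(tag, OFFSET)]

assert score('NOUN', 'fish') > score('VERB', 'fish')  # 1.1 > 0.5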
Code Example #12
File: most_common.py Project: arbylee1/gt-nlp-class
def get_tag_word_counts(filename):
    """build a dict of counters, one per tag, counting the words that go with each tag

    :param filename: training data file
    :returns: dict of counters
    :rtype: dict

    """
    all_counters = defaultdict(lambda : Counter())

    for words, tags in preproc.conll_seq_generator(filename):
        for word, tag in zip(words, tags):
            all_counters[tag][word] += 1
    return all_counters
Code Example #13
def get_tag_trans_counts(trainfile):
    """compute a dict of counters for tag transitions

    :param trainfile: name of file containing training data
    :returns: dict, in which keys are tags, and values are counters of succeeding tags
    :rtype: dict

    """
    gen = conll_seq_generator(trainfile)
    tot_counts = defaultdict(lambda: Counter())
    for (words, tags) in gen:
        tot_counts[START_TAG].update({tags[0]: 1})
        for i in range(len(tags) - 1):
            tot_counts[tags[i]].update({tags[i + 1]: 1})
        tot_counts[tags[len(tags) - 1]].update({END_TAG: 1})
    #tot_counts.pop(END_TAG)

    return dict(tot_counts)
Code Example #14
def get_nb_weights(trainfile, smoothing):
    """
    The estimate_nb function assumes one label per document, whereas in POS tagging we have a label
    for each token. So, to calculate the emission weights P(w|y) for a particular word and tag, we
    slightly modify the input: each token with its tag is treated as a document with its label.
    The following helper code converts the dataset to token-level bag-of-words feature vectors and
    labels. The weights obtained here will be used later as emission scores for the Viterbi tagger.

    inputs: trainfile: input file from which to estimate the nb_weights
    smoothing: smoothing value for the naive Bayes weights

    :returns: nb_weights: naive Bayes weights
    """
    token_level_docs = []
    token_level_tags = []
    for words, tags in preproc.conll_seq_generator(trainfile):
        token_level_docs += [{word: 1} for word in words]
        token_level_tags += tags
    nb_weights = estimate_nb(token_level_docs, token_level_tags, smoothing)
    return nb_weights
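For instance, a two-token sentence becomes two one-word "documents" with one label each; an illustration of what the conversion above produces:

words, tags = ['They', 'fish'], ['PRON', 'VERB']
docs = [{word: 1} for word in words]  # same conversion as in the loop above
assert docs == [{'They': 1}, {'fish': 1}]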
Code Example #15
def get_tag_trans_counts(trainfile):
    """compute a dict of counters for tag transitions

    :param trainfile: name of file containing training data
    :returns: dict, in which keys are tags, and values are counters of succeeding tags
    :rtype: dict

    """

    tot_counts = defaultdict(lambda: Counter())

    for words, tags in conll_seq_generator(trainfile):
        for index, tag in enumerate(tags):
            if index == 0:
                tot_counts[START_TAG].update([tag])
            if index == len(tags) - 1:
                tot_counts[tag].update([END_TAG])
            else:
                tot_counts[tag].update([tags[index + 1]])

    return dict(tot_counts)
Code Example #16
File: naive_bayes.py Project: cedebrun/gt-nlp-class
def get_nb_weights(trainfile, smoothing):
    """
    The estimate_nb function assumes one label per document, whereas in POS tagging we have a label
    for each token. So, to calculate the emission weights P(w|y) for a particular word and tag, we
    slightly modify the input: each token with its tag is treated as a document with its label.
    The following helper code converts the dataset to token-level bag-of-words feature vectors and
    labels. The weights obtained here will be used later as emission scores for the Viterbi tagger.

    inputs: trainfile: input file from which to estimate the nb_weights
    smoothing: smoothing value for the naive Bayes weights

    :returns: nb_weights: naive Bayes weights
    """
    token_level_docs=[]
    token_level_tags=[]
    for words,tags in preproc.conll_seq_generator(trainfile):
        token_level_docs += [{word:1} for word in words]
        token_level_tags +=tags
    nb_weights = estimate_nb(token_level_docs, token_level_tags, smoothing)
    
    return nb_weights
Code Example #17
File: most_common.py Project: arbylee1/gt-nlp-class
def get_most_common_word_weights(trainfile):
    """Return a set of weights, so that each word is tagged by its most frequent tag in the training file.
    If the word does not appear in the training file, the weights should be set so that the output tag is Noun.

    :param trainfile: training file
    :returns: classification weights
    :rtype: defaultdict

    """
    all_counters = defaultdict(lambda: Counter())
    for words, tags in preproc.conll_seq_generator(trainfile):
        for word, tag in zip(words, tags):
            all_counters[word][tag] += 1
    weights = defaultdict(float)
    for word in all_counters.keys():
        best_tag = (0, u'NOUN')
        for tag in all_counters[word].keys():
            if all_counters[word][tag] > best_tag[0]:
                best_tag = all_counters[word][tag], tag
        weights[(best_tag[1], word)] = 1.0
    weights[u'NOUN', OFFSET] = 0.99
    return weights
Code Example #18
File: most_common.py Project: cedebrun/gt-nlp-class
def get_word_to_ix(input_file, max_size=100000):
    """
    creates a vocab containing the most frequently occurring words, such that the size of the vocab <= max_size;
    also adds an UNK token to the vocab, and then creates a dictionary that maps each word to a unique index.
    :returns: vocab, dict
    vocab: list of words in the vocabulary
    dict: maps word to unique index
    """
    vocab_counter=Counter()
    for words,tags in conll_seq_generator(input_file):
        for word,tag in zip(words,tags):
            vocab_counter[word]+=1
    vocab = [ word for word,val in vocab_counter.most_common(max_size-1)]
    vocab.append(UNK)
    
    word_to_ix = {word: ix for ix, word in enumerate(vocab)}
    
    return vocab, word_to_ix
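A usage sketch: out-of-vocabulary words should be looked up via the UNK entry (TRAIN_FILE and UNK are module-level names assumed from the surrounding project):

vocab, word_to_ix = get_word_to_ix(TRAIN_FILE, max_size=6500)
ix = word_to_ix.get('never-seen-word', word_to_ix[UNK])  # fall back to the UNK index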
Code Example #19
File: most_common.py Project: cedebrun/gt-nlp-class
def get_tag_to_ix(input_file):
    """
    creates a dictionary that maps each tag to a unique index, and vice-versa
    (adding START_TAG and END_TAG is left commented out below)
    :returns: dict1, dict2
    dict1: maps tag to unique index
    dict2: maps each unique index to its own tag
    """
    tag_to_ix={}
    for i,(words,tags) in enumerate(conll_seq_generator(input_file)):
        for tag in tags:
            if tag not in tag_to_ix:
                tag_to_ix[tag] = len(tag_to_ix)
    
    #adding START_TAG and END_TAG
    #if START_TAG not in tag_to_ix:
    #    tag_to_ix[START_TAG] = len(tag_to_ix)
    #if END_TAG not in tag_to_ix:
    #    tag_to_ix[END_TAG] = len(tag_to_ix)
    
    ix_to_tag = {v:k for k,v in tag_to_ix.items()}
    
    return tag_to_ix, ix_to_tag
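A usage sketch confirming the two mappings are inverses (TRAIN_FILE assumed as above):

tag_to_ix, ix_to_tag = get_tag_to_ix(TRAIN_FILE)
for tag, ix in tag_to_ix.items():
    assert ix_to_tag[ix] == tag  # round-trip: tag -> index -> tag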