Code example #1
File: models.py  Project: ml-lab/pun-model
 def _parse_unigrams(self, path):
     print('parsing unigram data...')
     data = data2numpy(path)
     for row in data:
         word = row[0].lower()
         unigram = float(row[1])
         self.unigram[word] = unigram
     print('done')
     return
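The helper data2numpy from utils is not included in these excerpts. Judging from how it is used across the examples (row iteration, column slicing such as jokes_data[:, -1], and string fields converted with float()), a minimal stand-in could look like the sketch below; the actual implementation in the repository may differ, for example in how a header row is handled.

# Hypothetical stand-in for utils.data2numpy; not taken from the repository.
import csv
import numpy as np

def data2numpy(path):
    """Read a CSV file into a 2-D numpy array of string fields."""
    with open(path, newline='') as f:
        rows = list(csv.reader(f))
    # Whether a header row has to be skipped depends on the actual CSV files.
    return np.array(rows)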
Code example #2
 def __init__(self,
              relatedness_paths,
              homophone_unigram_paths,
              puns_data_path,
              self_relatedness=13.0):
     self.self_relatedness = self_relatedness
     self.use_trigrams = True
     self.unigram = {}
     self.trigram = {}
     self.homo_unigram = {}
     self.relatedness = {}
     self.word_vector = {}
     self.get_idx = map_puntypeID_to_idx(data2numpy(puns_data_path))
     self._parse_relatedness(data2numpy(relatedness_paths[0]), 'near')
     self._parse_relatedness(data2numpy(relatedness_paths[1]), 'identical')
     self._parse_unigrams(data2numpy(homophone_unigram_paths[0]), 'near')
     self._parse_unigrams(data2numpy(homophone_unigram_paths[1]),
                          'identical')
     print('loaded data model v1')
Code example #3
File: models.py  Project: ml-lab/pun-model
 def _parse_relatedness(self, path):
     print('parsing relatedness data...')
     data = data2numpy(path)
     for row in data:
         words = [row[0].lower(), row[1].lower()]
         words.sort()
         words = tuple(words)
         relatedness = float(row[2])
         self.relatedness[words] = relatedness
     print('done')
     return
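Because _parse_relatedness sorts each word pair before using it as a dictionary key, a lookup works regardless of the order in which the two words are given. A small illustrative helper (the function name is hypothetical, not from the repository):

# Hypothetical helper showing how the sorted-tuple keys built above can be queried.
def get_relatedness(relatedness, w1, w2, default=None):
    """Return the stored relatedness for a word pair, independent of word order."""
    key = tuple(sorted([w1.lower(), w2.lower()]))
    return relatedness.get(key, default)

# Both argument orders hit the same key, e.g.
# get_relatedness(rel, 'bank', 'river') == get_relatedness(rel, 'river', 'bank')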
Code example #4
File: models.py  Project: ml-lab/pun-model
 def _parse_trigrams(self, path):
     print('parsing trigram data...')
     data = data2numpy(path)
     for row in data:
         idx = int(row[0])
         word = row[1].lower()
         m1_trigram = float(row[2])
         m2_trigram = float(row[3])
         # save trigram data
         if idx not in self.trigram:
             self.trigram[idx] = {}
             self.word_vector[idx] = []
             self.trigram[idx]['m1'] = []
             self.trigram[idx]['m2'] = []
         self.trigram[idx]['m1'].append(m1_trigram)
         self.trigram[idx]['m2'].append(m2_trigram)
         self.word_vector[idx].append(word)
     print('done')
     return
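After _parse_trigrams runs, self.trigram[idx]['m1'], self.trigram[idx]['m2'], and self.word_vector[idx] are parallel lists: position i holds the two trigram probabilities and the word from the i-th parsed row for pun idx. One illustrative way to walk that structure (the function itself is not part of the repository):

# Illustrative only: iterate the parallel lists built by _parse_trigrams.
def iter_trigrams(model, idx):
    """Return (word, m1_trigram, m2_trigram) triples for one pun index."""
    return zip(model.word_vector[idx],
               model.trigram[idx]['m1'],
               model.trigram[idx]['m2'])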
Code example #5
#!/usr/bin/env python3
from utils import data2numpy

near_unigram_datapath = './homophones_unigram_near.csv'
identical_unigram_datapath = './homophones_unigram_identical.csv'

# get the whole homophone unigram data in a single list
unigram_data = []
near_unigram_data = data2numpy(near_unigram_datapath)
for row in near_unigram_data:
    unigram_data.append(row)
data = data2numpy(identical_unigram_datapath)
for row in data:
    unigram_data.append(row)

# make a word-to-unigram mapping
# ideally, it should be a one-to-one mapping
unigram = {}
for row in unigram_data:
    w1 = row[2].lower()
    w2 = row[3].lower()
    # unigram probabilities
    p_w1 = float(row[4])
    p_w2 = float(row[5])

    if w1 not in unigram:
        unigram[w1] = []
    if w2 not in unigram:
        unigram[w2] = []

    # log multiple unigram values for a single key, if any
    unigram[w1].append(p_w1)
    unigram[w2].append(p_w2)
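The comment above notes that the word-to-unigram mapping should ideally be one-to-one. One way to check that assumption, sketched here rather than taken from the original script, is to report keys whose collected values disagree:

# Sketch: report words that were assigned more than one distinct unigram value.
for word, values in unigram.items():
    if len(set(values)) > 1:
        print('conflicting unigram values for {}: {}'.format(word, values))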
Code example #6
File: merge_results.py  Project: ml-lab/pun-model
"""
This script merges two model output files, for near and identical type puns,
computed by the author's original script 'computeMeasures.py'. The merged
measures are saved to a new CSV file.
"""
from utils import data2numpy, map_puntypeID_to_idx, save_results

if __name__ == '__main__':
    jokes = data2numpy("../../data/data-agg.csv")
    get_idx = map_puntypeID_to_idx(jokes)
    jokes_data = {}

    # fill dictionary with model output on identical and near puns
    jokes_data = save_results('../ModelOutputs/near_trigram_13_0.csv',
                              jokes_data, get_idx)
    jokes_data = save_results('../ModelOutputs/identical_trigram_13_0.csv',
                              jokes_data, get_idx)

    # write model outputs in a new csv file
    output_path = "../ModelOutputs/data.csv"
    f_v0 = open(output_path, "w")
    f_v0.write(
        "idx,sentenceID,punType,sentenceType,ambiguity,distinctiveness," +
        "sentence\n")

    # parse measures from the original data sheet pun-model/data/data-agg.csv and
    # save them in the same format as data.csv for comparing model results
    f_original = open("../ModelOutputs/data-agg-measures.csv", "w")
    f_original.write("idx,sentenceID,punType,sentenceType,ambiguity," +
                     "distinctiveness,sentence\n")
    for idx in range(len(jokes_data)):
Code example #7
"""
Fits a linear regression model over the computed measures (ambiguity and
distinctiveness). Ground-truth funniness ratings are obtained from
pun-model/data/data-agg.csv. The regression model summary is printed to the console.
"""

import statsmodels.api as sm
import numpy as np
from utils import data2numpy

if __name__ == '__main__':
    # read groundtruth funniness ratings
    jokes_data_path = "../../data/data-agg.csv"
    jokes_data = data2numpy(jokes_data_path)
    gt = np.array(jokes_data[:, -1]).astype(float)

    # read ambiguity and distinctiveness measures computed by model
    model_output = data2numpy("../results/data.csv")
    amb = np.array(model_output[:, 4]).astype(float)
    dist = np.array(model_output[:, 5]).astype(float)

    # fit linear regression model
    X = np.stack([amb, dist]).transpose()
    y = gt
    model = sm.OLS(y, X)
    results = model.fit()
    print(results.summary())
Code example #8
"""
Fits a linear regression model over the computed measures (ambiguity and
distinctiveness). Ground-truth funniness ratings are obtained from
pun-model/data/data-agg.csv. The regression model summary is printed to the console.
"""

import statsmodels.api as sm
import numpy as np
from utils import data2numpy

if __name__ == '__main__':
    # read ground truth funniness ratings
    jokes_data_path = "../../data/data-agg.csv"
    jokes_data = data2numpy(jokes_data_path)
    gt = np.array(jokes_data[:, -1]).astype(float)

    # read ambiguity and distinctiveness measures computed by model
    model_output = data2numpy("../ModelOutputs/data.csv")
    amb = np.array(model_output[:, 4]).astype(float)
    dist = np.array(model_output[:, 5]).astype(float)

    # fit linear regression model
    X = np.stack([amb, dist]).transpose()
    y = gt
    model = sm.OLS(y, X)
    results = model.fit()
    print(results.summary())
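Note that sm.OLS does not add an intercept term by default, so both regression scripts above fit a model without a constant. If an intercept is wanted, statsmodels expects the constant column to be added explicitly, for example:

# Optional variant: include an intercept term in the regression.
X_const = sm.add_constant(X)
results_const = sm.OLS(y, X_const).fit()
print(results_const.summary())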
Code example #9
#!/usr/bin/env python3
from utils import data2numpy, map_puntypeID_to_idx

relatedness_near_datapath = ("./wordPair_relatedness_" +
                             "smoothedTrigrams_near.csv")
relatedness_identical_datapath = ("./wordPair_relatedness_" +
                                  "smoothedTrigrams_identical.csv")

# get m1 and m2 (two meanings/interpretations) for each sentence in the dataset
meanings = {}
puns_datapath = '../../data/data-agg.csv'
puns_data = data2numpy(puns_datapath)
for i, row in enumerate(puns_data):
    meanings[i] = [row[-3], row[-2]]

# map pun type (near/identical) and puntypeID to the index in the puns dataset.
# For example, get_idx['near'][1] yields the index of the pun with a 'near'
# homophone and 'near' sentence ID = 1 in the puns dataset [data-agg.csv]
get_idx = map_puntypeID_to_idx(data2numpy(puns_datapath))

# clean up relatedness data
relatedness = {}
data = data2numpy(relatedness_near_datapath)
for row in data:
    id = int(row[0])
    idx = get_idx['near'][id]
    m1 = meanings[idx][0]
    m2 = meanings[idx][1]
    word = row[3]
    m1_relatedness = float(row[4])
    m2_relatedness = float(row[5])
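The implementation of map_puntypeID_to_idx is not part of these excerpts. Based on the description in the comment above (get_idx['near'][1] gives the dataset index of the 'near' pun with sentence ID 1), a stand-in consistent with that behaviour might look like the following; the column positions of the pun type and the type-specific ID are assumptions, not taken from the repository.

# Hypothetical stand-in for utils.map_puntypeID_to_idx; column indices are guesses.
def map_puntypeID_to_idx(puns_data, type_col=1, id_col=2):
    """Map pun type ('near'/'identical') and type-specific sentence ID to row index."""
    mapping = {}
    for idx, row in enumerate(puns_data):
        pun_type = row[type_col].lower()
        type_id = int(row[id_col])
        mapping.setdefault(pun_type, {})[type_id] = idx
    return mapping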