def evaluate_crubadan_odin(filename):
  '''
  Trains models from the Crubadan data and runs them on Odin
  '''
  print "Loading character features..."
  trainsetchar = get_features('crubadan', option='char')
  print "Loading word features..."
  trainsetword = get_features('crubadan', option='word')
  print "Loading test data..."
  labels = [x[0] for x in odin.source_sents()]
  test = [(Counter(sentence2ngrams(x[1], with_word_boundary=True, option='allgrams')), Counter(x[1].split())) for x in odin.source_sents()]
  
  print "Calculating results..."
  with open(filename,'w') as f:
    f.write(' '.join(labels)+'\n')
    labels = None
    for lang in sorted(trainsetchar.keys()):
      print lang
      charresult = lang
      wordresult = lang
      modelchar = SGT(trainsetchar.pop(lang))
      modelword = SGT(trainsetword.pop(lang))
      for sentence in test:
        charresult += ' ' + float.hex(modelchar.estimate(sentence[0]))
        wordresult += ' ' + float.hex(modelword.estimate(sentence[1]))
      f.write(charresult+'\n')
      f.write(wordresult+'\n')
  print "Done!"
Example #3
def tfidfize(data_source, option='3gram'):
  # see http://timtrueman.com/a-quick-foray-into-linear-algebra-and-python-tf-idf/
  # see http://scikit-learn.org/stable/modules/preprocessing.html
  from collections import defaultdict
  import math, os, io
  import cPickle as pickle

  tfidf_pickle = ''.join([data_source,'-',option,'-tfidf','.pk'])

  if os.path.exists(tfidf_pickle):
    with io.open(tfidf_pickle,'rb') as fin:
      featureset = pickle.load(fin)
  else:
    featureset = defaultdict(dict)
    _featureset = get_features(data_source, option=option)
    
    print 'Calculating TF-IDF for %s, please wait patiently...' % data_source
    for lang in _featureset:
      for gram in _featureset[lang]:
        tf = _featureset[lang][gram] / float(sum(_featureset[lang].values()))
        idf = math.log(len(_featureset)) / len([i for i in _featureset if gram in _featureset[i]])
        featureset[lang][gram] = tf * idf
        print lang, gram, _featureset[lang][gram], tf, idf, tf * idf
        
    with io.open(tfidf_pickle,'wb') as fout:
      pickle.dump(featureset, fout)      
  return featureset
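As a rough hand-check of the weighting above (toy counts, not drawn from any real corpus), the same tf and idf formulas can be traced on a tiny made-up feature set:

# Toy illustration only: trace the tf * idf computation used in tfidfize()
# on two invented "languages" with a handful of 3-gram counts.
import math

toy = {'aaa': {'_ab': 4, 'abc': 1},
       'bbb': {'_ab': 2, 'xyz': 3}}

for lang in toy:
  total = float(sum(toy[lang].values()))
  for gram in toy[lang]:
    tf = toy[lang][gram] / total                  # relative frequency within the language
    df = len([l for l in toy if gram in toy[l]])  # no. of languages containing the gram
    idf = math.log(len(toy)) / df                 # same formula as in tfidfize() above
    print lang, gram, tf * idf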
Example #5
def check_data_integrity(data_source="all", remove=True):
  """
  Remove and repickle the extracted feature files and count:
  i.   no. of languages in original source data
  ii.  no. of languages in extracted features  
  """
  import os, glob
  from extractfeature import get_features
  from universalcorpus import odin, omniglot, udhr, wikipedia
  if remove:
    # Remove all/selected pickled files
    toremove = '*.pk' if data_source == "all" else data_source+"*"
    for i in glob.glob(toremove):
      os.remove(i)
    
  # Rebuild pickled files.
  torebuild = ['odin','omniglot','udhr','crubadan','wikipedia'] \
              if data_source == 'all' else [data_source]
  for i in torebuild:
    print "Accessing features from %s, please wait ..." % (i)
    charngrams,wordfreq = get_features(i, option=None, shutup=True)
    print "%s-word.pk contains data for %d Languages." % (i,len(wordfreq))
    print "Original source contains data for %d Languages" % \
          locals()[i].num_languages()
    missing = set(locals()[i].languages()) - set(wordfreq.keys()) 
    print "Thrown languages:",missing, "\n"
Example #7
def features2numpy(data_source, option="3gram"):
  featureset = get_features(data_source, option=option)
  all_features = list(set(chain(*[i.keys() for i in featureset.values()])))
  all_tags = [i for i in featureset]
  data, target = [], []
  for lang in featureset:
    data.append([featureset[lang].get(j, 0) for j in all_features])
    target.append(lang)
    # Sanity check
    ##print [(j,featureset[lang][j]) for j in all_features if featureset[lang][j] > 0]
  return np.array(data), np.array(target), all_features
def classify_odin(sentence, verbose=True):
  '''
  Given an input string, classifies it based on Odin character n-grams.
  Effectively an informal test.
  '''
  test = Counter(sentence2ngrams(sentence, with_word_boundary=True, option='allgrams'))
  trainset = get_features('odin', option='char')
  sgt_results = []
  for lang in trainset:
    train = trainset[lang]
    sgt_results.append((SGT(train, min=6000).estimate(test),lang))
  sgt_results.sort(reverse=True)
  if verbose:
    for i in sgt_results[:10]:
      print i
  return sgt_results
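The docstring calls this an informal test; a one-line check (hypothetical call, assuming the Odin character features have already been extracted and pickled) might look like:

# Hypothetical usage, not in the source: print the ten best-scoring languages
# for a short German sentence.
classify_odin("ich bin schwanger", verbose=True)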
Example #10
def features2numpy(data_source, option="3gram", tfidf=False):
    if tfidf:
        featureset = tfidfize(data_source, option=option)
    else:
        featureset = get_features(data_source, option=option)

    all_features = list(set(chain(*[i.keys() for i in featureset.values()])))
    all_tags = [i for i in featureset]
    data, target = [], []
    for lang in featureset:
        data.append([
            featureset[lang][j] if j in featureset[lang] else 0
            for j in all_features
        ])
        target.append(lang)
        # Sanity check
        ##print [(j,featureset[lang][j]) for j in all_features if featureset[lang][j] > 0]
    return np.array(data), np.array(target), all_features
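Assuming scikit-learn is available (it is not imported anywhere in this module) and the crubadan features have been extracted, the matrix and label vector returned above could be fed straight into a standard classifier; a sketch:

# Hypothetical usage sketch, not part of the original code: train a simple
# Naive Bayes classifier on the (languages x n-grams) count/TF-IDF matrix.
from sklearn.naive_bayes import MultinomialNB

data, target, all_features = features2numpy('crubadan', option='3gram', tfidf=True)
clf = MultinomialNB()
clf.fit(data, target)            # one row per language, labelled by its language code
print clf.predict(data[:1])      # sanity check: should echo target[0]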
Example #11
def sugarlid_cosine(text, option='3gram', data_source='crubadan'):
  """ Cosine Vector based sugarlid. """
  from cosine import cosine_similarity
  char_ngrams = get_features(data_source, option=option)
  ##for i in char_ngrams:
  ##  print char_ngrams[i]
  #print sentence2ngrams(text, option=option)
  try:
    query_vector = " ".join(sentence2ngrams(text, option=option))
  except TypeError:
    query_vector = " ".join(["_".join(i) for i in \
                             sentence2ngrams(text, option=option)])
    print query_vector
  results = []
  for i in char_ngrams:
    lang_vector = " ".join([str(j+" ")*char_ngrams[i][j] \
                            for j in char_ngrams[i]])
    score = cosine_similarity(query_vector, lang_vector)
    if score > 0:
      results.append((score,i))
  return sorted(results, reverse=True)
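The project-local cosine_similarity() is not shown here; as a rough stand-in it is assumed to treat its two arguments as space-separated bags of tokens. A self-contained Counter-based equivalent, on toy data:

# Illustrative sketch only (assumed behaviour of cosine_similarity, toy data):
# cosine similarity between two bags of character n-grams held in Counters.
import math
from collections import Counter

def toy_cosine(a, b):
  dot = sum(a[k] * b[k] for k in a if k in b)
  norm = math.sqrt(sum(v * v for v in a.values())) \
         * math.sqrt(sum(v * v for v in b.values()))
  return dot / norm if norm else 0.0

print toy_cosine(Counter(['_ab', 'abc', 'bc_']), Counter(['_ab', 'abd', 'bd_']))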
Example #13
train = Counter({'a':1,'b':5,'c':2})
test = Counter({'b':1,'a':1})

langSGT = SGT(train)
langMLE = MLE(train)

print SGTestimate(langSGT,test)
print MLEestimate(langMLE,test)
'''

from extractfeature import sentence2ngrams, get_features
s = "ich bin schwanger"
test = Counter(sentence2ngrams(s, with_word_boundary=True))
print test

trainset = get_features('odin', option='3gram')
sgt_results = []
mle_results = []

'''
german = SGT(trainset['deu'])
wakawaka = SGT(trainset['wkw'])

for x in test:
  print x, trainset['deu'][x], SGTestimate(german, Counter({x:1}))
  print x, trainset['wkw'][x], SGTestimate(wakawaka, Counter({x:1}))

print len(trainset['wkw'])

'''
for lang in trainset: