예제 #1
0
def get_goog_match_by_assign(path):
    """Score each assignment's translations against the Google translations.

    Reads the CSV file at ``path`` (columns ``assignment_id``,
    ``translation`` and ``word_id``).  For every assignment it counts how
    many of its words have an entry in ``get_goog_translations()``
    (``total``) and how many of those translations equal the Google entry
    after stripping whitespace and lower-casing (``syns``).

    Rows whose assignment language is not in ``goog_langs`` are skipped, and
    rows whose ``word_id`` has no Google entry are ignored silently.

    Returns:
        dict mapping assignment id to ``(total, syns, ratio)``.  For an
        assignment with no counted word, all three entries are ``'N/A'``.

    Raises:
        KeyError: if an assignment id is missing from the hit/language maps
            or a required CSV column is absent.
    """
    matches = get_goog_translations()
    data = {}
    numlangmap, _ = dat.lang_map()
    hitlangs = dat.hits_language()
    hits = dictionaries.hit_map()
    # Use a context manager so the CSV handle is closed deterministically.
    with open(path) as handle:
        for line in csv.DictReader(handle):
            assign = line['assignment_id']
            translation = line['translation']
            word_id = line['word_id']
            # Register the assignment first so that even fully skipped
            # assignments appear in the result (as 'N/A').
            if assign not in data:
                data[assign] = {'total': 'N/A', 'syns': 'N/A'}
            assign_lang = numlangmap[hitlangs[hits[assign]]]
            if assign_lang not in goog_langs:
                print("Skipping lang %s not supported by google"
                      % (assign_lang,))
                continue
            if word_id not in matches:
                continue
            # Switch from the 'N/A' placeholder to real counters on the
            # first word that can actually be compared.
            if data[assign]['total'] == 'N/A':
                data[assign] = {'total': 0, 'syns': 0}
            counts = data[assign]
            if translation.strip().lower() == matches[word_id].strip().lower():
                counts['syns'] += 1
            counts['total'] += 1

    ret = dict()
    for assign in data:
        total = data[assign]['total']
        syns = data[assign]['syns']
        if total == 0:
            ret[assign] = (total, syns, 0)
        elif total == 'N/A':
            print("%s is N/A" % (assign,))
            ret[assign] = (total, syns, 'N/A')
        else:
            # float() keeps the division fractional under Python 2 as well.
            ret[assign] = (total, syns, float(syns) / total)
    return ret
예제 #2
0
def get_language_dicts_quals(path,
                             qual_cutoff=None,
                             strict=False,
                             filter_list=None):
    """Accumulate assignment quality per language.

    Each line of the file at ``path`` is tab-separated and starts with an
    assignment id; only that first field is used here.

    Args:
        path: file to read, one assignment per line.
        qual_cutoff: when not None, assignments whose quality is ``'N/A'``
            or below the cutoff are skipped (and reported on stdout).
        strict: when true, a quality equal to ``qual_cutoff`` is skipped
            too; otherwise only qualities strictly below it are.
        filter_list: when not None, only assignment ids contained in it are
            processed.

    Returns:
        dict mapping a language (as resolved through ``hit_map``,
        ``dat.hits_language`` and ``dat.lang_map``) to
        ``{'num': <sum of qualities>, 'denom': <number of assignments>}``.
    """
    all_dicts = dict()
    hitmap = hit_map()
    langids, _ = dat.lang_map()
    hitlangs = dat.hits_language()
    quals = qual_map()
    # Iterate the file inside a context manager so the handle is closed.
    with open(path) as handle:
        for assign in handle:
            aid = assign.strip().split('\t')[0]
            if aid == '':
                continue
            if filter_list is not None and aid not in filter_list:
                continue
            if qual_cutoff is not None:
                qual = quals[aid]
                if strict:
                    below = qual == 'N/A' or qual <= qual_cutoff
                else:
                    below = qual == 'N/A' or qual < qual_cutoff
                if below:
                    print('below cutoff %s %s' % (aid, qual))
                    continue
            alang = langids[hitlangs[hitmap[aid]]]
            totals = all_dicts.setdefault(alang, {'num': 0, 'denom': 0})
            # NOTE(review): with qual_cutoff=None an 'N/A' quality is not
            # filtered out above and would reach this addition -- confirm
            # that callers never hit that combination.
            totals['num'] += quals[aid]
            totals['denom'] += 1
    return all_dicts
예제 #3
0
def get_language_dicts_quals(path, qual_cutoff=None, strict=False, filter_list=None):
    """Sum assignment quality scores per language.

    The file at ``path`` holds one tab-separated assignment per line; the
    first field is the assignment id and is the only field read here.

    Args:
        path: file to read.
        qual_cutoff: when not None, skip (and report on stdout) assignments
            whose quality is ``'N/A'`` or below the cutoff.
        strict: when true, a quality equal to ``qual_cutoff`` is also
            skipped.
        filter_list: when not None, restrict processing to the assignment
            ids it contains.

    Returns:
        dict of language -> ``{'num': sum of qualities, 'denom': count}``.
    """
    all_dicts = dict()
    hitmap = hit_map()
    langids, _ = dat.lang_map()
    hitlangs = dat.hits_language()
    quals = qual_map()
    apply_filter = filter_list is not None
    apply_cutoff = qual_cutoff is not None
    # The context manager closes the file even if a lookup below raises.
    with open(path) as handle:
        for assign in handle:
            comps = assign.strip().split('\t')
            aid = comps[0]
            if aid == '':
                continue
            if apply_filter and aid not in filter_list:
                continue
            if apply_cutoff:
                qual = quals[aid]
                if qual == 'N/A':
                    rejected = True
                elif strict:
                    rejected = qual <= qual_cutoff
                else:
                    rejected = qual < qual_cutoff
                if rejected:
                    print('below cutoff %s %s' % (aid, qual))
                    continue
            alang = langids[hitlangs[hitmap[aid]]]
            if alang not in all_dicts:
                all_dicts[alang] = {'num': 0, 'denom': 0}
            # NOTE(review): an 'N/A' quality reaches this line when no
            # cutoff is given and would fail the addition -- confirm intent.
            all_dicts[alang]['num'] += quals[aid]
            all_dicts[alang]['denom'] += 1
    return all_dicts
예제 #4
0
def get_language_dicts_quals_turkers(path, qual_cutoff=None, strict=False, filter_list=None, match=True):
    """Accumulate per-language quality using the turker-level quality map.

    Reads one tab-separated assignment per line from ``path`` (only the
    leading assignment id is used).  Assignments missing from
    ``turker_map()`` are ignored.

    Args:
        path: file to read.
        qual_cutoff: when not None, skip (and report on stdout) assignments
            whose own quality (``qual_map``) is ``'N/A'`` or below the
            cutoff.
        strict: when true, a quality equal to ``qual_cutoff`` is skipped as
            well.
        filter_list: optional collection of assignment ids.
        match: when true, ``filter_list`` lists the assignments to include;
            when false, it lists the assignments to skip.

    Returns:
        dict of language -> ``{'num': ..., 'denom': ...}`` where ``num`` is
        the sum of ``turker_qual_map()[turker_map()[aid]]`` over the kept
        assignments and ``denom`` is their count.
    """
    all_dicts = dict()
    tmap = turker_map()
    hitmap = hit_map()
    langids, _ = dat.lang_map()
    hitlangs = dat.hits_language()
    quals = qual_map()
    tquals = turker_qual_map()
    # Context manager: the handle is closed once the loop ends or raises.
    with open(path) as handle:
        for assign in handle:
            aid = assign.strip().split('\t')[0]
            if aid == '' or aid not in tmap:
                continue
            if filter_list is not None:
                listed = aid in filter_list
                # match=True keeps listed ids; match=False drops them.
                if listed != bool(match):
                    continue
            if qual_cutoff is not None:
                qual = quals[aid]
                if strict:
                    below = qual == 'N/A' or qual <= qual_cutoff
                else:
                    below = qual == 'N/A' or qual < qual_cutoff
                if below:
                    print('below cutoff %s %s' % (aid, qual))
                    continue
            alang = langids[hitlangs[hitmap[aid]]]
            totals = all_dicts.setdefault(alang, {'num': 0, 'denom': 0})
            totals['num'] += tquals[tmap[aid]]
            totals['denom'] += 1
    return all_dicts
예제 #5
0
def get_language_dicts_quals_turkers(path, qual_cutoff=None, strict=False, filter_list=None, match=True):
    """Sum turker-level quality per language over the assignments in a file.

    Every line of ``path`` is tab-separated with the assignment id first;
    the remaining fields are not used.  Ids that are empty or absent from
    ``turker_map()`` are skipped.

    Args:
        path: file to read.
        qual_cutoff: when not None, assignments whose ``qual_map`` entry is
            ``'N/A'`` or below the cutoff are reported on stdout and
            skipped.
        strict: when true, qualities equal to ``qual_cutoff`` are skipped
            too.
        filter_list: optional collection of assignment ids.
        match: true means ``filter_list`` is an include-list, false means it
            is a skip-list.

    Returns:
        dict of language -> ``{'num': sum of turker qualities,
        'denom': number of kept assignments}``.
    """
    all_dicts = dict()
    tmap = turker_map()
    hitmap = hit_map()
    langids, _ = dat.lang_map()
    hitlangs = dat.hits_language()
    quals = qual_map()
    tquals = turker_qual_map()
    has_filter = filter_list is not None
    has_cutoff = qual_cutoff is not None
    # Read through a context manager so the file is always closed.
    with open(path) as handle:
        for assign in handle:
            comps = assign.strip().split('\t')
            aid = comps[0]
            if aid == '' or aid not in tmap:
                continue
            if has_filter:
                if match and aid not in filter_list:
                    continue
                if not match and aid in filter_list:
                    continue
            if has_cutoff:
                qual = quals[aid]
                if qual == 'N/A':
                    rejected = True
                elif strict:
                    rejected = qual <= qual_cutoff
                else:
                    rejected = qual < qual_cutoff
                if rejected:
                    print('below cutoff %s %s' % (aid, qual))
                    continue
            alang = langids[hitlangs[hitmap[aid]]]
            if alang not in all_dicts:
                all_dicts[alang] = {'num': 0, 'denom': 0}
            all_dicts[alang]['num'] += tquals[tmap[aid]]
            all_dicts[alang]['denom'] += 1
    return all_dicts
예제 #6
0
def get_language_dicts(path,
                       qual_cutoff=None,
                       strict=False,
                       filter_list=None,
                       match=True):
    """Collect the translations submitted for each word, grouped by language.

    Each line of ``path`` is tab-separated: an assignment id followed by
    ``word:translation`` pairs.  A pair is split at its first ``:``.

    Args:
        path: file to read.
        qual_cutoff: when not None, skip (and report on stdout) assignments
            whose quality is ``'N/A'`` or below the cutoff.
        strict: when true, a quality equal to ``qual_cutoff`` is skipped as
            well.
        filter_list: optional collection of assignment ids.
        match: when true, ``filter_list`` lists the assignments to include;
            when false, it lists the assignments to skip.

    Returns:
        dict of language -> dict of word -> list of stripped, lower-cased
        translations (empty translations are dropped; duplicates are kept).
    """
    all_dicts = dict()
    hitmap = hit_map()
    langids, _ = dat.lang_map()
    hitlangs = dat.hits_language()
    quals = qual_map()
    filtered = 0
    # Context manager so the input handle is closed when we are done.
    with open(path) as handle:
        for cnt, assign in enumerate(handle):
            comps = assign.strip().split('\t')
            aid = comps[0]
            # Progress indicator: echo every 1000th assignment id.
            if cnt % 1000 == 0:
                print(aid)
            if aid == '':
                continue
            if filter_list is not None:
                listed = aid in filter_list
                # match=True keeps listed ids; match=False drops them.
                if listed != bool(match):
                    filtered += 1
                    continue
            if qual_cutoff is not None:
                qual = quals[aid]
                if strict:
                    below = qual == 'N/A' or qual <= qual_cutoff
                else:
                    below = qual == 'N/A' or qual < qual_cutoff
                if below:
                    print('below cutoff %s %s' % (aid, qual))
                    continue
            alang = langids[hitlangs[hitmap[aid]]]
            # Register the language even if no usable pair follows.
            lang_dict = all_dicts.setdefault(alang, dict())
            for pair in comps[1:]:
                word, sep, trans = pair.partition(':')
                if not sep:
                    # Malformed pair without ':'.  Report the pair itself;
                    # the previous code printed `word`, which was unbound
                    # (NameError) or left over from the preceding pair.
                    print(pair)
                    continue
                trans = trans.strip()
                if trans == '':
                    continue
                lang_dict.setdefault(word, []).append(trans.lower())
    print('FILTERED %d ASSIGNMENTS' % filtered)
    return all_dicts
예제 #7
0
def get_language_dicts(path, qual_cutoff=None, strict=False, filter_list=None, match=True):
    """Build per-language dictionaries of word -> submitted translations.

    Lines of ``path`` are tab-separated: the assignment id, then any number
    of ``word:translation`` pairs (split at the first ``:``).

    Args:
        path: file to read.
        qual_cutoff: when not None, assignments whose quality is ``'N/A'``
            or below the cutoff are reported on stdout and skipped.
        strict: when true, qualities equal to ``qual_cutoff`` are skipped
            too.
        filter_list: optional collection of assignment ids.
        match: true means ``filter_list`` is an include-list, false means it
            is a skip-list.

    Returns:
        dict of language -> dict of word -> list of stripped, lower-cased
        non-empty translations, in file order.
    """
    all_dicts = dict()
    hitmap = hit_map()
    langids, _ = dat.lang_map()
    hitlangs = dat.hits_language()
    quals = qual_map()
    filtered = 0
    has_filter = filter_list is not None
    has_cutoff = qual_cutoff is not None
    # The with-block guarantees the file is closed after reading.
    with open(path) as handle:
        for cnt, assign in enumerate(handle):
            comps = assign.strip().split('\t')
            aid = comps[0]
            # Progress indicator: echo every 1000th assignment id.
            if cnt % 1000 == 0:
                print(aid)
            if aid == '':
                continue
            if has_filter:
                if match:
                    excluded = aid not in filter_list
                else:
                    excluded = aid in filter_list
                if excluded:
                    filtered += 1
                    continue
            if has_cutoff:
                qual = quals[aid]
                if qual == 'N/A':
                    rejected = True
                elif strict:
                    rejected = qual <= qual_cutoff
                else:
                    rejected = qual < qual_cutoff
                if rejected:
                    print('below cutoff %s %s' % (aid, qual))
                    continue
            alang = langids[hitlangs[hitmap[aid]]]
            if alang not in all_dicts:
                all_dicts[alang] = dict()
            for pair in comps[1:]:
                colon = pair.find(':')
                if colon < 0:
                    # No ':' separator.  Print the offending pair; the old
                    # handler printed `word`, which could be unbound
                    # (NameError) or belong to the previous pair.
                    print(pair)
                    continue
                word = pair[:colon]
                trans = pair[colon + 1:].strip()
                if trans == '':
                    continue
                if word not in all_dicts[alang]:
                    all_dicts[alang][word] = [trans.lower()]
                else:
                    all_dicts[alang][word].append(trans.lower())
    print('FILTERED %d ASSIGNMENTS' % filtered)
    return all_dicts
예제 #8
0
def get_language_dicts(path, filter_list=None):
    """Compute the overall time span of the assignments for each language.

    Reads the tab-delimited file at ``path`` with ``csv.DictReader`` and
    uses its ``id`` column as the assignment id.  ``time_map()`` is expected
    to yield a ``(start, end)`` pair per assignment id.

    Args:
        path: tab-delimited file with a header row containing ``id``.
        filter_list: when not None, only assignment ids contained in it are
            considered.

    Returns:
        dict of language -> ``(earliest start, latest end)`` over the kept
        assignments of that language.
    """
    all_dicts = dict()
    hitmap = dictionaries.hit_map()
    langids, _ = dat.lang_map()
    hitlangs = dat.hits_language()
    timemap = time_map()
    # Close the input file deterministically instead of leaking the handle.
    with open(path) as handle:
        for assign in csv.DictReader(handle, delimiter='\t'):
            aid = assign['id']
            if aid == '':
                continue
            if filter_list is not None and aid not in filter_list:
                continue
            alang = langids[hitlangs[hitmap[aid]]]
            tstart, tend = timemap[aid]
            if alang in all_dicts:
                first, last = all_dicts[alang]
                all_dicts[alang] = (min(first, tstart), max(last, tend))
            else:
                # First assignment seen for this language seeds the span.
                all_dicts[alang] = (tstart, tend)
    return all_dicts
예제 #9
0
def format_for_time_series(data):
    """Group the ``total`` value of each record by assignment language.

    ``data`` is an iterable of 4-tuples ``(aid, complete, start, total)``;
    only ``aid`` and ``total`` are used.  Values for the languages ``'ru'``
    and ``'ur'`` are shifted forward by two days before being stored, and
    the ``'en'`` group, if any, is removed from the result.

    Returns:
        dict of language -> list of ``total`` values in input order.
    """
    print("More formatting data")
    hitmap = dictionaries.hit_map()
    langids, _ = dat.lang_map()
    hitlangs = dat.hits_language()
    # NOTE(review): the reason for the two-day offset is not visible here.
    shifted_langs = ('ru', 'ur')
    offset_days = 2
    all_times = {}
    for aid, _complete, _start, total in data:
        lang = langids[hitlangs[hitmap[aid]]]
        bucket = all_times.setdefault(lang, [])
        if lang in shifted_langs:
            bucket.append(total + datetime.timedelta(days=offset_days))
        else:
            bucket.append(total)
    # Drop the English series when present; no error if it is absent.
    all_times.pop('en', None)
    return all_times
예제 #10
0
def count_turkers_verbose(path,
                          qual_cutoff=None,
                          strict=False,
                          filter_list=None,
                          match=True):
    """List the turkers behind each language's assignments, split by country.

    Reads one tab-separated assignment per line from ``path`` (only the
    leading assignment id is used).  Ids that are empty or missing from
    ``assign_dict()`` are ignored.

    Args:
        path: file to read.
        qual_cutoff: when not None, skip (and report on stdout) assignments
            whose quality is ``'N/A'`` or below the cutoff.
        strict: when true, a quality equal to ``qual_cutoff`` is skipped as
            well.
        filter_list: optional collection of assignment ids.
        match: when true, ``filter_list`` lists the assignments to include;
            when false, it lists the assignments to skip.

    Returns:
        dict of language -> dict of country -> list of
        ``turker_map()[aid]`` values, one entry per kept assignment (so a
        turker can appear more than once).
    """
    all_dicts = dict()
    hitmap = hit_map()
    langids, _ = dat.lang_map()
    hitlangs = dat.hits_language()
    tmap = turker_map()
    quals = qual_map()
    assignmap = assign_dict()
    # Context manager: the handle is closed once the loop ends or raises.
    with open(path) as handle:
        for assign in handle:
            aid = assign.strip().split('\t')[0]
            if aid == '' or aid not in assignmap:
                continue
            if filter_list is not None:
                listed = aid in filter_list
                # match=True keeps listed ids; match=False drops them.
                if listed != bool(match):
                    continue
            if qual_cutoff is not None:
                qual = quals[aid]
                if strict:
                    below = qual == 'N/A' or qual <= qual_cutoff
                else:
                    below = qual == 'N/A' or qual < qual_cutoff
                if below:
                    print('below cutoff %s %s' % (aid, qual))
                    continue
            alang = langids[hitlangs[hitmap[aid]]]
            country = assignmap[aid]['country']
            by_country = all_dicts.setdefault(alang, dict())
            by_country.setdefault(country, list()).append(tmap[aid])
    return all_dicts
예제 #11
0
def get_goog_match_by_assign(path):
    """Dump the Google translations to ``googmatch-old.pkl`` and exit.

    As written, this function pickles the result of
    ``get_goog_translations()`` and then terminates the interpreter with
    ``exit(0)``; it never returns to its caller.

    The code after the ``exit(0)`` call is therefore unreachable.  It is
    kept because it shows the intended scoring: per assignment in the CSV
    at ``path`` (columns ``assignment_id``, ``translation``, ``word_id``),
    count the words that have a Google entry (``total``) and those whose
    translation equals it after strip/lower (``syns``), returning
    ``{assignment: (total, syns, ratio)}`` with ``'N/A'`` placeholders for
    assignments with no counted word.
    """
    matches = get_goog_translations()
    # Binary mode is what pickle expects, and the with-block makes sure the
    # dump is flushed and closed before the process exits.
    with open('googmatch-old.pkl', 'wb') as dump_file:
        pickle.dump(matches, dump_file)
    # NOTE(review): this early exit looks like leftover one-off tooling; it
    # makes everything below dead code.  Behavior is preserved on purpose --
    # remove the dump and exit if the scoring path should run again.
    exit(0)
    data = {}
    numlangmap, _ = dat.lang_map()
    hitlangs = dat.hits_language()
    hits = dictionaries.hit_map()
    with open(path) as handle:
        for line in csv.DictReader(handle):
            assign = line['assignment_id']
            translation = line['translation']
            word_id = line['word_id']
            # Register the assignment first so skipped ones still appear.
            if assign not in data:
                data[assign] = {'total': 'N/A', 'syns': 'N/A'}
            assign_lang = numlangmap[hitlangs[hits[assign]]]
            if assign_lang not in goog_langs:
                print("Skipping lang %s not supported by google"
                      % (assign_lang,))
                continue
            if word_id not in matches:
                continue
            if data[assign]['total'] == 'N/A':
                data[assign] = {'total': 0, 'syns': 0}
            counts = data[assign]
            if translation.strip().lower() == matches[word_id].strip().lower():
                counts['syns'] += 1
            counts['total'] += 1
    ret = dict()
    for assign in data:
        total = data[assign]['total']
        syns = data[assign]['syns']
        if total == 0:
            ret[assign] = (total, syns, 0)
        elif total == 'N/A':
            print("%s is N/A" % (assign,))
            ret[assign] = (total, syns, 'N/A')
        else:
            ret[assign] = (total, syns, float(syns) / total)
    return ret
예제 #12
0
def count_turkers_verbose(path, qual_cutoff=None, strict=False, filter_list=None, match=True):
    """Group the turkers of the kept assignments by language and country.

    Every line of ``path`` is tab-separated with the assignment id first;
    the remaining fields are not used.  Ids that are empty or absent from
    ``assign_dict()`` are skipped.

    Args:
        path: file to read.
        qual_cutoff: when not None, assignments whose quality is ``'N/A'``
            or below the cutoff are reported on stdout and skipped.
        strict: when true, qualities equal to ``qual_cutoff`` are skipped
            too.
        filter_list: optional collection of assignment ids.
        match: true means ``filter_list`` is an include-list, false means it
            is a skip-list.

    Returns:
        dict of language -> dict of country -> list of
        ``turker_map()[aid]`` values (one per kept assignment, duplicates
        preserved).
    """
    all_dicts = dict()
    hitmap = hit_map()
    langids, _ = dat.lang_map()
    hitlangs = dat.hits_language()
    tmap = turker_map()
    quals = qual_map()
    assignmap = assign_dict()
    has_filter = filter_list is not None
    has_cutoff = qual_cutoff is not None
    # Read through a context manager so the file is always closed.
    with open(path) as handle:
        for assign in handle:
            comps = assign.strip().split('\t')
            aid = comps[0]
            if aid == '' or aid not in assignmap:
                continue
            if has_filter:
                if match and aid not in filter_list:
                    continue
                if not match and aid in filter_list:
                    continue
            if has_cutoff:
                qual = quals[aid]
                if qual == 'N/A':
                    rejected = True
                elif strict:
                    rejected = qual <= qual_cutoff
                else:
                    rejected = qual < qual_cutoff
                if rejected:
                    print('below cutoff %s %s' % (aid, qual))
                    continue
            alang = langids[hitlangs[hitmap[aid]]]
            country = assignmap[aid]['country']
            if alang not in all_dicts:
                all_dicts[alang] = dict()
            if country not in all_dicts[alang]:
                all_dicts[alang][country] = list()
            all_dicts[alang][country].append(tmap[aid])
    return all_dicts