def get_goog_match_by_assign(path): matches = get_goog_translations() words = word_map() data = {} numlangmap, langmap = dat.lang_map() hitlangs = dat.hits_language() hits = dictionaries.hit_map() for line in csv.DictReader(open(path)): assign = line['assignment_id'] translation = line['translation'] word_id = line['word_id'] if(assign not in data): data[assign] = {'total': 'N/A', 'syns': 'N/A'} assign_lang = numlangmap[hitlangs[hits[assign]]] if(assign_lang not in goog_langs): print "Skipping lang", assign_lang, "not supported by google" continue if word_id in matches: if data[assign]['total'] == 'N/A': data[assign] = {'total': 0, 'syns': 0} if translation.strip().lower() == matches[word_id].strip().lower(): data[assign]['syns'] += 1 data[assign]['total'] += 1 # else: # print 'Could not find', assign, word_id, 'in google dictionary. Skipping' ret = dict() for a in data: if data[a]['total'] == 0: ret[a] = (data[a]['total'], data[a]['syns'], 0) elif data[a]['total'] == 'N/A': print a, "is N/A" ret[a] = (data[a]['total'], data[a]['syns'], 'N/A') else: ret[a] = (data[a]['total'],data[a]['syns'],float(data[a]['syns'])/data[a]['total']) return ret
def get_language_dicts_quals(path, qual_cutoff=None, strict=False, filter_list=None): all_dicts = dict() hitmap = hit_map() langids, langcodes = dat.lang_map() hitlangs = dat.hits_language() quals = qual_map() for assign in open(path).readlines(): comps = assign.strip().split('\t') aid = comps[0] if (aid == ''): continue if (not (filter_list == None) and (aid not in filter_list)): continue if (not (qual_cutoff == None) and strict and (quals[aid] == 'N/A' or quals[aid] <= qual_cutoff)): print 'below cutoff', aid, quals[aid] continue if (not (qual_cutoff == None) and not (strict) and (quals[aid] == 'N/A' or quals[aid] < qual_cutoff)): print 'below cutoff', aid, quals[aid] continue alang = langids[hitlangs[hitmap[aid]]] if not (alang in all_dicts): all_dicts[alang] = {'num': 0, 'denom': 0} all_dicts[alang]['num'] += quals[aid] all_dicts[alang]['denom'] += 1 return all_dicts
def get_language_dicts_quals(path, qual_cutoff=None, strict=False, filter_list=None): all_dicts = dict() hitmap = hit_map() langids, langcodes = dat.lang_map() hitlangs = dat.hits_language() quals = qual_map() for assign in open(path).readlines(): comps = assign.strip().split('\t') aid = comps[0] if(aid == ''): continue if(not(filter_list == None) and (aid not in filter_list)): continue if(not(qual_cutoff == None) and strict and (quals[aid] == 'N/A' or quals[aid] <= qual_cutoff)): print 'below cutoff', aid, quals[aid] continue if(not(qual_cutoff == None) and not(strict) and (quals[aid] == 'N/A' or quals[aid] < qual_cutoff)): print 'below cutoff', aid, quals[aid] continue alang = langids[hitlangs[hitmap[aid]]] if not(alang in all_dicts): all_dicts[alang] = {'num':0, 'denom':0} all_dicts[alang]['num'] += quals[aid] all_dicts[alang]['denom'] += 1 return all_dicts
def get_language_dicts_quals_turkers(path, qual_cutoff=None, strict=False, filter_list=None, match=True): all_dicts = dict() tmap = turker_map() hitmap = hit_map() langids, langcodes = dat.lang_map() hitlangs = dat.hits_language() quals = qual_map() tquals = turker_qual_map() for assign in open(path).readlines(): comps = assign.strip().split('\t') aid = comps[0] if(aid == '' or aid not in tmap): continue #if match, filter_list is list of assignments to include, otherwise, filter_list is list of assignments to skip if(match): list_pass = not(filter_list == None) and (aid not in filter_list) else: list_pass = not(filter_list == None) and (aid in filter_list) if(list_pass): continue #if strict, assignments must be greater than qual_cutoff to pass, otherwise, assignments can be greater or equal to qual_cutoff if(strict): qual_pass = not(qual_cutoff == None) and (quals[aid] == 'N/A' or quals[aid] <= qual_cutoff) else: qual_pass = not(qual_cutoff == None) and (quals[aid] == 'N/A' or quals[aid] < qual_cutoff) if(qual_pass): print 'below cutoff', aid, quals[aid] continue alang = langids[hitlangs[hitmap[aid]]] if not(alang in all_dicts): all_dicts[alang] = {'num':0, 'denom':0} all_dicts[alang]['num'] += tquals[tmap[aid]] all_dicts[alang]['denom'] += 1 return all_dicts
def get_language_dicts(path, qual_cutoff=None, strict=False, filter_list=None, match=True): all_dicts = dict() tmap = turker_map() hitmap = hit_map() langids, langcodes = dat.lang_map() hitlangs = dat.hits_language() quals = qual_map() tquals = turker_qual_map() filtered = 0 for cnt, assign in enumerate(open(path).readlines()): comps = assign.strip().split('\t') aid = comps[0] if cnt % 1000 == 0: print aid if (aid == ''): # or aid not in tmap): continue #if match, filter_list is list of assignments to include, otherwise, filter_list is list of assignments to skip if (match): list_pass = not (filter_list == None) and (aid not in filter_list) else: list_pass = not (filter_list == None) and (aid in filter_list) if (list_pass): filtered += 1 continue #if strict, assignments must be greater than qual_cutoff to pass, otherwise, can be greater or equal to qual_cutoff if (strict): qual_pass = not (qual_cutoff == None) and ( quals[aid] == 'N/A' or quals[aid] <= qual_cutoff) else: qual_pass = not (qual_cutoff == None) and ( quals[aid] == 'N/A' or quals[aid] < qual_cutoff) if (qual_pass): print 'below cutoff', aid, quals[aid] continue alang = langids[hitlangs[hitmap[aid]]] if not alang in all_dicts: all_dicts[alang] = dict() for pair in comps[1:]: try: word = pair[:pair.index(':')] trans = pair[pair.index(':') + 1:] except ValueError: print word continue if not (trans.strip() == ''): if not word in all_dicts[alang]: all_dicts[alang][word] = [trans.strip().lower()] else: all_dicts[alang][word].append(trans.strip().lower()) print 'FILTERED %d ASSIGNMENTS' % filtered return all_dicts
def get_language_dicts(path, qual_cutoff=None, strict=False, filter_list=None, match=True): all_dicts = dict() tmap = turker_map() hitmap = hit_map() langids, langcodes = dat.lang_map() hitlangs = dat.hits_language() quals = qual_map() tquals = turker_qual_map() filtered = 0 for cnt, assign in enumerate(open(path).readlines()): comps = assign.strip().split('\t') aid = comps[0] if cnt%1000 == 0: print aid if(aid == ''): # or aid not in tmap): continue #if match, filter_list is list of assignments to include, otherwise, filter_list is list of assignments to skip if(match): list_pass = not(filter_list == None) and (aid not in filter_list) else: list_pass = not(filter_list == None) and (aid in filter_list) if(list_pass): filtered += 1 continue #if strict, assignments must be greater than qual_cutoff to pass, otherwise, can be greater or equal to qual_cutoff if(strict): qual_pass = not(qual_cutoff == None) and (quals[aid] == 'N/A' or quals[aid] <= qual_cutoff) else: qual_pass = not(qual_cutoff == None) and (quals[aid] == 'N/A' or quals[aid] < qual_cutoff) if(qual_pass): print 'below cutoff', aid, quals[aid] continue alang = langids[hitlangs[hitmap[aid]]] if not alang in all_dicts: all_dicts[alang] = dict() for pair in comps[1:]: try: word = pair[:pair.index(':')] trans = pair[pair.index(':')+1:] except ValueError: print word continue if not(trans.strip() == ''): if not word in all_dicts[alang]: all_dicts[alang][word] = [trans.strip().lower()] else: all_dicts[alang][word].append(trans.strip().lower()) print 'FILTERED %d ASSIGNMENTS'%filtered return all_dicts
def get_language_dicts(path, filter_list=None):
    """Compute the per-language (earliest start, latest end) time window.

    NOTE(review): this redefines `get_language_dicts`, which also exists
    elsewhere in this file with a different signature — the name that
    wins depends on definition order; consider renaming this one.

    Reads a tab-delimited CSV with an 'id' column and looks each
    assignment up in time_map() (which yields a (start, end) pair).
    Returns {lang: (min_start, max_end)}.
    """
    windows = dict()
    hitmap = dictionaries.hit_map()
    langids, langcodes = dat.lang_map()
    hitlangs = dat.hits_language()
    timemap = time_map()
    for row in csv.DictReader(open(path), delimiter='\t'):
        aid = row['id']
        if aid == '':
            continue
        if filter_list is not None and aid not in filter_list:
            continue
        alang = langids[hitlangs[hitmap[aid]]]
        tstart, tend = timemap[aid]
        if alang not in windows:
            windows[alang] = (tstart, tend)
        else:
            lo, hi = windows[alang]
            windows[alang] = (min(lo, tstart), max(hi, tend))
    return windows
def format_for_time_series(data): print "More formatting data" hitmap = dictionaries.hit_map() langids, langcodes = dat.lang_map() hitlangs = dat.hits_language() all_times = dict() for aid, complete, start, total in data: lang = langids[hitlangs[hitmap[aid]]] if lang not in all_times: all_times[lang] = list() if lang == 'ru' or lang == 'ur': all_times[lang].append(total + datetime.timedelta(days=2)) else: all_times[lang].append(total) try: all_times.pop('en') except KeyError: pass return all_times
def count_turkers_verbose(path, qual_cutoff=None, strict=False, filter_list=None, match=True): all_dicts = dict() hitmap = hit_map() langids, langcodes = dat.lang_map() hitlangs = dat.hits_language() tmap = turker_map() quals = qual_map() assignmap = assign_dict() for assign in open(path).readlines(): comps = assign.strip().split('\t') aid = comps[0] if aid == '' or aid not in assignmap: continue #if match, filter_list is list of assignments to include, otherwise, filter_list is list of assignments to skip if (match): list_pass = not (filter_list == None) and (aid not in filter_list) else: list_pass = not (filter_list == None) and (aid in filter_list) if (list_pass): continue #if strict, assignments must be greater than qual_cutoff to pass, otherwise, assignments can be greater or equal to qual_cutoff if (strict): qual_pass = not (qual_cutoff == None) and ( quals[aid] == 'N/A' or quals[aid] <= qual_cutoff) else: qual_pass = not (qual_cutoff == None) and ( quals[aid] == 'N/A' or quals[aid] < qual_cutoff) if qual_pass: print 'below cutoff', aid, quals[aid] continue alang = langids[hitlangs[hitmap[aid]]] country = assignmap[aid]['country'] if not (alang in all_dicts): all_dicts[alang] = dict() if country not in all_dicts[alang]: all_dicts[alang][country] = list() all_dicts[alang][country].append(tmap[aid]) return all_dicts
def get_goog_match_by_assign(path): matches = get_goog_translations() pickle.dump(matches, open('googmatch-old.pkl', 'w')) exit(0) words = word_map() data = {} numlangmap, langmap = dat.lang_map() hitlangs = dat.hits_language() hits = dictionaries.hit_map() for line in csv.DictReader(open(path)): assign = line['assignment_id'] translation = line['translation'] word_id = line['word_id'] if (assign not in data): data[assign] = {'total': 'N/A', 'syns': 'N/A'} assign_lang = numlangmap[hitlangs[hits[assign]]] if (assign_lang not in goog_langs): print "Skipping lang", assign_lang, "not supported by google" continue if word_id in matches: if data[assign]['total'] == 'N/A': data[assign] = {'total': 0, 'syns': 0} if translation.strip().lower() == matches[word_id].strip().lower(): data[assign]['syns'] += 1 data[assign]['total'] += 1 # else: # print 'Could not find', assign, word_id, 'in google dictionary. Skipping' ret = dict() for a in data: if data[a]['total'] == 0: ret[a] = (data[a]['total'], data[a]['syns'], 0) elif data[a]['total'] == 'N/A': print a, "is N/A" ret[a] = (data[a]['total'], data[a]['syns'], 'N/A') else: ret[a] = (data[a]['total'], data[a]['syns'], float(data[a]['syns']) / data[a]['total']) return ret
def count_turkers_verbose(path, qual_cutoff=None, strict=False, filter_list=None, match=True): all_dicts = dict() hitmap = hit_map() langids, langcodes = dat.lang_map() hitlangs = dat.hits_language() tmap = turker_map() quals = qual_map() assignmap = assign_dict() for assign in open(path).readlines(): comps = assign.strip().split('\t') aid = comps[0] if aid == '' or aid not in assignmap: continue #if match, filter_list is list of assignments to include, otherwise, filter_list is list of assignments to skip if(match): list_pass = not(filter_list == None) and (aid not in filter_list) else: list_pass = not(filter_list == None) and (aid in filter_list) if(list_pass): continue #if strict, assignments must be greater than qual_cutoff to pass, otherwise, assignments can be greater or equal to qual_cutoff if(strict): qual_pass = not(qual_cutoff == None) and (quals[aid] == 'N/A' or quals[aid] <= qual_cutoff) else: qual_pass = not(qual_cutoff == None) and (quals[aid] == 'N/A' or quals[aid] < qual_cutoff) if qual_pass: print 'below cutoff', aid, quals[aid] continue alang = langids[hitlangs[hitmap[aid]]] country = assignmap[aid]['country'] if not(alang in all_dicts): all_dicts[alang] = dict() if country not in all_dicts[alang]: all_dicts[alang][country] = list() all_dicts[alang][country].append(tmap[aid]) return all_dicts