def testDeleteDictEntry(self):
    today = datetime.date.today()
    current_time = int(time.time())
    u = self.createUser("da_zbur", "enabled", 10)
    u.use_questions = "yes"
    u.put()
    d1 = self.createDictEntry("da_zbur", 2, "lucrative",
        u"profitable, moneymaking, remunerative", "[LOO-kruh-tiv]")
    d2 = self.createDictEntry("da_zbur", 2, "ferociously(en)",
        u"жестоко, яростно, свирепо, дико, неистово. Ужасно, невыносимо.")
    l1 = self.createLearnListItem("da_zbur", d1, today, current_time)
    l2 = self.createLearnListItem("da_zbur", d2, today, current_time)
    self.createQuestion(l1, today, "da_zbur", d1.word,
                        "profitable, moneymaking", 1, today, 100)
    self.createQuestion(l2, today, "da_zbur", d2.word, u"лажа", 2, today, 0)

    deleteDictEntry(u, "lucrative[LOO-kruh-tiv]")

    self.assertEqual(None, Dictionary.all().\
        filter("word =", "lucrative").get())
    self.assertEqual(1, LearnList.all().count())
    self.assertEqual(1, Question.all().count())
    self.assertEqual(1, Dictionary.all().count())

def create_dictionary(entries, dataset):
    dictionary = Dictionary()
    for ent in entries:
        if dataset == 'vqd':
            qs = ent['question']
        else:
            qs = ent['sentence']['sent']
        dictionary.tokenize(qs, True)
    return dictionary

def create_dictionary(ds):
    dictionary = Dictionary()
    entries = []
    for group in ['train', 'test']:
        with open(dataset[ds][group], 'rb') as f:
            d = pickle.load(f)
            entries.extend(d)
    for ent in entries:
        qs = ent['question']
        dictionary.tokenize(qs, True)
    return dictionary

def __init__(self, **kwargs):
    file = kwargs.get('file')
    self.isnms = kwargs.get('isnms')
    self.trainembd = kwargs.get('trainembd')
    # 6 position-encoded vectors, as used by IRLS
    self.spatial = True
    with open(file, 'rb') as f:
        self.data = pickle.load(f)
    if self.trainembd:
        self.dictionary = Dictionary.load_from_file(
            kwargs.get('dictionaryfile'))
    if kwargs.get('testrun'):
        self.data = self.data[:32]
    self.pool_features_path_coco = kwargs.get('coco_pool_features')
    self.pool_features_path_genome = kwargs.get('genome_pool_features')
    self.poolcoco_id_to_index = self._poolcreate_coco_id_to_index(
        self.pool_features_path_coco)
    self.poolcoco_id_to_index_gen = self._poolcreate_coco_id_to_index(
        self.pool_features_path_genome)
    self.image_features_path_coco = kwargs.get('coco_bottomup')
    self.coco_id_to_index = self.id_to_index(self.image_features_path_coco)
    self.image_features_path_genome = kwargs.get('genome_bottomup')
    self.genome_id_to_index = self.id_to_index(
        self.image_features_path_genome)

def __init__(self, **kwargs):
    dataset = kwargs.get('dataset')
    splitBy = kwargs.get('splitBy')
    split = kwargs.get('split')
    data_json = osp.join('cache/prepro', dataset + "_" + splitBy,
                         split + '.json')
    with open(data_json, 'r') as f:
        self.data = json.load(f)
    # Only keep questions that have exactly one bounding box as the answer
    datanew = []
    for ent in self.data:
        gtbox = ent['gtbox']
        if len(gtbox[0]) != 0 and len(gtbox) == 1:
            datanew.append(ent)
    self.data = datanew
    dictfile = kwargs.get('dictionaryfile')
    self.dictionary = Dictionary.load_from_file(dictfile)
    if kwargs.get('testrun'):
        self.data = self.data[:32]
    self.spatial = True
    self.image_features_path_coco = kwargs.get('coco_bottomup')
    self.coco_id_to_index = self.id_to_index(self.image_features_path_coco)
    print("Dataset [{}] loaded....".format(dataset))
    print("Split [{}] has {} ref exps.".format(split, len(self.data)))

def get(self):
    parser = reqparse.RequestParser()
    parser.add_argument('id', required=False)
    args = parser.parse_args()
    if args['id']:
        if Dictionary.query.filter(Dictionary.id == args['id']).count():
            dictionaries = Dictionary.query.filter(
                Dictionary.id == args['id'])
            js = Dictionary.serialize(dictionaries[0])
        else:
            return {'message': 'Dictionary not found', 'result': {}}, 404
    else:
        dictionaries = Dictionary.query.all()
        js = [Dictionary.serialize(d) for d in dictionaries]
    return {'message': 'Success', 'result': js}, 200

def getUsers():
    userlist = []
    for user in User.all().order("-total_points").run():
        # The filter value was scrubbed in the source; user.twitter is the
        # natural argument given the "username" field below.
        c = Dictionary.all().\
            filter("twitter_user =", user.twitter).count()
        userlist.append({"username": user.twitter,
                         "points": user.total_points,
                         "wordscount": c})
    return json.dumps(userlist)

def __init__(self):
    i = words_iter()
    if i:
        self.words_iterator = i
    else:
        # The filter argument was redacted ("******") in the source and is
        # left as-is here; the surrounding structure is recoverable.
        words = Dictionary.all().filter("twitter_user =", "******")
        self.words_iterator = {"words": words, "pos": 0}
        set_words_iter(self.words_iterator)

def get(self):
    user = current_user()
    if not user:
        self.redirect("/login")
    else:
        parameters = {}
        parameters["total_points"] = User.all().\
            filter("twitter =", user.twitter).get().total_points
        parameters["total_words"] = Dictionary.all().\
            filter("twitter_user =", user.twitter).count()
        self.view(parameters)

class Group:
    def __init__(self, name, path, type):
        super().__init__()
        self.name = name
        self.path = path
        self.type = type
        self.dictionary = Dictionary()
        self.documents = []
        self.totalCountedWords = 0

    def readDocuments(self, stopWords=[], headers=[], fastReading=False):
        self.dictionary.clean()
        print(f"Start reading group {self.name}, type: {self.type}")
        bar = defaultProgress(len(self.documents)).start()
        i = 0
        for document in self.documents:
            document.readWords(stopWords, headers, fastReading)
            for word in document.dictionary.words:
                self.dictionary.searchAndAddWord(
                    GroupedWord(word.text, self, word.counted, 1))
            document.clearReadedWords()
            i += 1
            bar.update(i)
        self.setTotalCountedWords()
        bar.finish()
        print(f"Done reading group {self.name}")

    def setTotalCountedWords(self):
        self.totalCountedWords = 0
        for word in self.dictionary.words:
            self.totalCountedWords += word.counted

    def __str__(self):
        return f"Group: {self.name}"

def editDictEntry(user, original_word, new_string):
    original_word, _ = parseOpt(original_word)
    dict_entry = Dictionary.all().\
        filter("twitter_user =", user.twitter).\
        filter("word =", original_word.strip()).get()
    if dict_entry:
        parsed_dict = parseMessage(new_string, '')
        if parsed_dict != {}:
            dict_entry.word = parsed_dict["word"]
            dict_entry.meaning = parsed_dict["meaning"]
            dict_entry.pronounce = parsed_dict["pronounce"]
            dict_entry.put()
    return json.dumps({})

def deleteDictEntry(user, word):
    word, _ = parseOpt(word)
    dict_entry = Dictionary.all().\
        filter("twitter_user =", user.twitter).\
        filter("word =", word.strip()).get()
    if dict_entry:
        lli = LearnList.all().\
            filter("dict_entry =", dict_entry.key()).get()
        # Guard against a missing LearnList item; the original would
        # raise AttributeError on lli.key() here
        if lli:
            for q in Question.all().filter("lli_ref =", lli.key()).run():
                q.delete()
            lli.delete()
        dict_entry.delete()
    return json.dumps({})

def testProcessDuplicateWord(self):
    json_file = open("files/direct_message1.json")
    message_json = simplejson.load(json_file)
    twitter_dm = DirectMessage.NewFromJsonDict(message_json)
    processMessage(twitter_dm)

    # Processing the same message a second time must not add a duplicate
    json_file = open("files/direct_message1.json")
    message_json = simplejson.load(json_file)
    twitter_dm = DirectMessage.NewFromJsonDict(message_json)
    processMessage(twitter_dm)

    query = Dictionary.all()
    # fetch(2) so that a duplicate entry, if present, would be detected;
    # fetch(1) can never return more than one result
    results = query.fetch(2)
    self.assertEqual(1, len(results))

def post(self):
    parser = reqparse.RequestParser()
    parser.add_argument('name', required=True)
    args = parser.parse_args()
    dictionary = Dictionary(args['name'])
    session.session.add(dictionary)
    session.session.commit()
    return {'message': 'Success', 'result': {}}, 201, \
        {'Location': '/dictionary/:' + str(dictionary.id)}

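# The get()/post() handlers above follow the Flask-RESTful resource pattern.
# A minimal sketch of how such a resource could be wired up; the class name
# DictionaryResource and the '/dictionary' route are assumptions, not from
# the source.
from flask import Flask
from flask_restful import Api, Resource

app = Flask(__name__)
api = Api(app)

class DictionaryResource(Resource):
    # the get()/post() methods shown above would live here
    pass

api.add_resource(DictionaryResource, '/dictionary')
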
def testProcessMessageFromNonExistentUser(self):
    # Message from user "spammer" who doesn't exist in database
    # It must not be processed and must not be saved
    json_file = open("files/direct_message_spammer.json")
    message_json = simplejson.load(json_file)
    twitter_dm = DirectMessage.NewFromJsonDict(message_json)
    processMessage(twitter_dm)

    query = Dictionary.all()
    results = query.fetch(1)
    self.assertEqual(0, len(results))
    self.assertEqual("spammer", twitter_dm.sender_screen_name)

    # Test integration with LearnList
    query = LearnList.all()
    ll_results = query.fetch(2)
    self.assertEqual(0, len(ll_results))

def __init__(self, **kwargs):
    dataset = kwargs.get('dataset')
    splitBy = kwargs.get('splitBy')
    split = kwargs.get('split')
    data_json = osp.join('cache/prepro', "vqd" + "_" + splitBy,
                         split + '.json')
    with open(data_json, 'r') as f:
        self.data = json.load(f)
    # vqd1 restricts to questions with exactly one ground-truth box
    if dataset == 'vqd1':
        print('VQDv1 1box loaded .......')
        datanew = []
        for ent in self.data:
            gtbox = ent['gtbox']
            if len(gtbox[0]) != 0 and len(gtbox) == 1:
                datanew.append(ent)
        self.data = datanew
    dictfile = kwargs.get('dictionaryfile')
    self.dictionary = Dictionary.load_from_file(dictfile)
    if kwargs.get('testrun'):
        self.data = self.data[:32]
    self.spatial = True
    self.image_features_path_coco = kwargs.get('vqd_detfeats').format(split)
    self.coco_id_to_index = self.id_to_index(self.image_features_path_coco)
    print("Dataset [{}] loaded....".format(dataset))
    print("Split [{}] has {} ref exps.".format(split, len(self.data)))
    cocoids = set(self.coco_id_to_index)
    if kwargs.get('istrain'):
        cocoids.remove(81768)  # this id does not exist
    # Some image ids are not in the feature file; drop those entries
    datanew = []
    for ent in self.data:
        if ent['image_id'] in cocoids:
            datanew.append(ent)
    self.data = datanew

def __init__(self, **kwargs):
    dataset = kwargs.get('dataset')
    splitBy = kwargs.get('splitBy')
    split = kwargs.get('split')
    data_json = osp.join('cache/prepro', dataset + "_" + splitBy,
                         split + '.json')
    with open(data_json, 'r') as f:
        self.data = json.load(f)
    dictfile = kwargs.get('dictionaryfile')
    self.dictionary = Dictionary.load_from_file(dictfile)
    if kwargs.get('testrun'):
        self.data = self.data[:32]
    self.spatial = True
    print("Dataset [{}] loaded....".format(dataset))
    print("Split [{}] has {} ref exps.".format(split, len(self.data)))

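# All of the dataset loaders above follow the same pattern: load a
# preprocessed split, then map each sentence to token ids with the loaded
# Dictionary. A minimal sketch of that encoding step, assuming tokenize's
# second argument toggles adding unseen words to the vocabulary (as the
# create_dictionary functions suggest); max_length and the padding_idx
# attribute are assumptions, not from the source.
def encode_sentence(dictionary, sentence, max_length=14):
    # tokenize(..., False): look up ids only, don't grow the vocabulary
    tokens = dictionary.tokenize(sentence, False)[:max_length]
    if len(tokens) < max_length:
        # left-pad to a fixed length; the pad index is an assumed attribute
        tokens = [dictionary.padding_idx] * (max_length - len(tokens)) + tokens
    return tokens
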
class Document:
    def __init__(self, name, path):
        super().__init__()
        self.name = name
        self.path = path
        self.dictionary = Dictionary()
        self.readedWords = []
        self.totalWords = 0

    def readWords(self, stopWords=[], headers=[], fastReading=False):
        self.dictionary.clean()
        with open(self.path, 'r', encoding="ISO-8859-1") as file:
            lines = file.readlines()
        if headers is not None:
            # Filter instead of calling lines.remove() while iterating,
            # which skips elements
            lines = [line for line in lines
                     if not any(line.startswith(h) for h in headers)]
        if fastReading is False:
            vectorizer = CountVectorizer(stop_words=stopWords)
            x = vectorizer.fit_transform(lines)
            self.readedWords = vectorizer.get_feature_names()
            self.totalWords = len(self.readedWords)
            for arrayLine in x.toarray():
                for i in range(0, len(arrayLine)):
                    if arrayLine[i] != 0:
                        self.dictionary.searchAndAddWord(
                            CountedWord(self.readedWords[i], arrayLine[i]))
        else:
            words = []
            for line in lines:
                words += line.split()
            self.totalWords = len(words)
            for word in words:
                try:
                    # index() raises ValueError for words not in the stop
                    # list (AttributeError if stopWords is None), so only
                    # non-stop words are added
                    stopWords.index(word)
                except (ValueError, AttributeError):
                    self.dictionary.searchAndAddWord(CountedWord(word.lower()))

    def clearReadedWords(self):
        self.readedWords = []

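# A minimal usage sketch for Document and Group; the corpus paths, stop-word
# list, and header prefixes below are placeholders, not from the source.
group = Group("spam", "corpus/spam", "train")
group.documents = [
    Document("msg1.txt", "corpus/spam/msg1.txt"),
    Document("msg2.txt", "corpus/spam/msg2.txt"),
]
group.readDocuments(stopWords=["the", "a", "and"],
                    headers=["Subject:"], fastReading=True)
print(group, group.totalCountedWords)
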
def testProcessMessageNormalAddForExistingUser(self):
    json_file = open("files/direct_message1.json")
    message_json = simplejson.load(json_file)
    twitter_dm = DirectMessage.NewFromJsonDict(message_json)
    processMessage(twitter_dm)

    query = Dictionary.all()
    results = query.fetch(1)
    self.assertEqual(1, len(results))
    self.assertEqual("", results[0].pronounce)
    self.assertEqual("da_zbur", results[0].twitter_user)
    self.assertEqual(289180663729512448L, results[0].message_id)
    self.assertEqual("to advet", results[0].word)
    self.assertEqual(u"обращаться к,ссылаться на", results[0].meaning)
    self.assertEqual(0, results[0].served)
    self.assertEqual(None, results[0].source_lang)
    self.assertEqual(1, User.all().filter("twitter =",
                     "da_zbur").get().total_points)

    # Test integration with LearnList
    query = LearnList.all()
    ll_results = query.fetch(2)
    self.assertEqual(1, len(ll_results))
    # Check if LearnList references same object
    self.assertEqual(ll_results[0].dict_entry.key(), results[0].key())

def __init__(self, **kwargs):
    dataset = kwargs.get('dataset')
    splitBy = kwargs.get('splitBy')
    split = kwargs.get('split')
    data_json = osp.join('cache/prepro', dataset + "_" + splitBy,
                         split + '.json')
    with open(data_json, 'r') as f:
        self.data = json.load(f)
    dictfile = kwargs.get('dictionaryfile')
    self.dictionary = Dictionary.load_from_file(dictfile)
    if kwargs.get('testrun'):
        self.data = self.data[:32]
    self.spatial = True
    feats_use = '{}_{}_det_feats.h5'.format(dataset, splitBy)
    self.image_features_path_coco = osp.join(kwargs.get('refcoco_frcnn'),
                                             feats_use)
    self.coco_id_to_index = self.id_to_index(self.image_features_path_coco)
    print("Dataset [{}] loaded....".format(dataset))
    print("Split [{}] has {} ref exps.".format(split, len(self.data)))

def createDictEntry(self, twitter_user, message_id, word, meaning,
                    pronounce=""):
    dictEntry = Dictionary()
    dictEntry.twitter_user = twitter_user
    dictEntry.message_id = message_id
    dictEntry.word = word
    dictEntry.pronounce = pronounce
    dictEntry.meaning = meaning
    dictEntry.served = 0
    dictEntry.source_lang = ""
    dictEntry.target_lang = ""
    dictEntry.put()
    return dictEntry

def addNewDictEntry(twitter_user, message_id, entry, served):
    new_dict_entry = None
    # No duplicate words allowed for a single user
    c = Dictionary.all().filter("word =", entry["word"]).\
        filter("twitter_user =", twitter_user).count()
    # The call around this log line was scrubbed in the source;
    # logging.info is an assumption
    logging.info("Count for word %s is %s" % (entry["word"], c))
    if c == 0:
        new_dict_entry = Dictionary()
        new_dict_entry.pronounce = entry["pronounce"]
        new_dict_entry.twitter_user = twitter_user
        new_dict_entry.message_id = message_id
        new_dict_entry.word = entry["word"]
        new_dict_entry.meaning = entry["meaning"]
        new_dict_entry.served = served
        new_dict_entry.source_lang = entry["source_lang"]
        new_dict_entry.put()
    return new_dict_entry

import os
import os.path as osp
import json

import numpy as np

for ds in config.dataset:
    kwargs = {**config.global_config, **config.dataset[ds]}
    data_root = kwargs.get('data_root')
    dataset = kwargs.get('dataset')
    splitBy = kwargs.get('splitBy')
    splits = kwargs.get('splits')
    data = []
    for split in splits + ['train']:
        data_json = osp.join('cache/prepro', dataset + "_" + splitBy,
                             split + '.json')
        with open(data_json, 'r') as f:
            d = json.load(f)
        data.extend(d)
    d = create_dictionary(data, dataset=dataset)
    basedir = os.path.dirname(kwargs['dictionaryfile'].format(dataset))
    if not os.path.exists(basedir):
        os.mkdir(basedir)
    d.dump_to_file(kwargs['dictionaryfile'].format(dataset))
    d = Dictionary.load_from_file(kwargs['dictionaryfile'].format(dataset))
    emb_dim = 300
    glove = 'glove/glove.6B.%dd.txt' % emb_dim
    embedding_basedir = os.path.dirname(kwargs['glove'])
    glove_file = embedding_basedir.format(glove)
    weights, word2emb = create_glove_embedding_init(d.idx2word, glove_file)
    np.save(
        os.path.join(embedding_basedir.format(ds),
                     'glove6b_init_%dd.npy' % emb_dim),
        weights)

def postMessage(self, user):
    #print "You are %s " % user.twitter
    words = Dictionary.all()\
        .filter("twitter_user =", user.twitter)\
        .filter("served < ", user.repeat_times)
    dict_entry_list = []
    message = ""
    for entry in words:
        dict_entry_list.append(entry)
    # If the user doesn't have enough of his own words to fill all slots
    # for the day, fill the remaining slots with words from people he follows
    if len(dict_entry_list) < user.messages_per_day:
        follow_list = user.i_follow.split(",")
        # For an empty string, split() returns a list with one '' element
        if follow_list == ['']:
            follow_list = []
        # Shuffle the list so we get some variety in users
        random.shuffle(follow_list)
        for follow_user in follow_list:
            f_repeat = 0
            for f_user in User.all().filter("twitter =", follow_user):
                f_repeat = f_user.repeat_times
                # Getting the list of languages the user follows
                follow_lang_list = f_user.follow_lang_list
                l = []
                for lang in follow_lang_list.split(","):
                    l.append("'" + lang + "'")
                lang_str = "(" + ",".join(l) + ")"
                words = Dictionary.all()\
                    .filter("twitter_user =", follow_user)\
                    .filter("served < ", f_repeat)\
                    .filter("source_lang IN ", lang_str)
                for entry in words:
                    dict_entry_list.append(entry)
                    #print "Adding %s from %s" % (entry.word, follow_user)
            if len(dict_entry_list) >= user.messages_per_day:
                break
    #print "You have %d words in your list" % len(dict_entry_list)
    # If we have any messages to send at all
    if len(dict_entry_list) > 0:
        dict_entry = random.sample(dict_entry_list, 1)[0]
        served = dict_entry.served + 1
        if dict_entry.pronounce:
            pronounce = dict_entry.pronounce
        else:
            pronounce = ""
        count = " [%s]" % served
        # If we are posting a message from the followed-by list, add
        # "(via @username)" when the total message is under 140 characters
        if dict_entry.twitter_user != user.twitter:
            via = "(via " + dict_entry.twitter_user + ")"
        else:
            via = ""
        if user.default_source_lang != dict_entry.source_lang:
            lang = " (" + dict_entry.source_lang + ")"
        else:
            lang = ""
        message = dict_entry.word + lang + pronounce + ": " + \
            dict_entry.meaning + count
        if len(message + via) < 140:
            message = message + via
        if user.message_type == "reply":
            try:
                self.twitter.api.PostUpdate(
                    "@" + user.twitter + " " + message,
                    in_reply_to_status_id=dict_entry.message_id)
                #print "You will be sent word %s %s" % (dict_entry.word, via)
            except TwitterError as e:
                # Bind the exception instance; the original accessed
                # TwitterError.message on the class, which is a bug
                logging.error("Twitter error: %s when sending message %s" %
                              (e.message, "@" + dict_entry.twitter_user +
                               " " + message))
        # Direct messages are no longer used
        #if user.message_type == "direct":
        #    self.twitter.api.PostDirectMessage(dict_entry.twitter_user, message)
        # We do not change the served field for words from other users
        if via == "":
            dict_entry.served = dict_entry.served + 1
            dict_entry.put()
    return message

import numpy as np


def create_glove_embedding_init(idx2word, glove_file):
    word2emb = {}
    with open(glove_file, 'r') as f:
        entries = f.readlines()
    emb_dim = len(entries[0].split(' ')) - 1
    print('embedding dim is %d' % emb_dim)
    weights = np.zeros((len(idx2word), emb_dim), dtype=np.float32)
    for entry in entries:
        vals = entry.split(' ')
        word = vals[0]
        vals = [float(val) for val in vals[1:]]
        word2emb[word] = np.array(vals)
    for idx, word in enumerate(idx2word):
        if word not in word2emb:
            continue
        weights[idx] = word2emb[word]
    return weights, word2emb


if __name__ == '__main__':
    ds = 'Ourdb'
    d = create_dictionary(ds)
    d.dump_to_file('data/dictionary.pickle')
    d = Dictionary.load_from_file('data/dictionary.pickle')
    emb_dim = 300
    glove_file = 'data/glove/glove.6B.%dd.txt' % emb_dim
    weights, word2emb = create_glove_embedding_init(d.idx2word, glove_file)
    np.save('data/glove6b_init_%dd.npy' % emb_dim, weights)

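# The saved .npy can then seed an embedding layer at model-construction time.
# A minimal sketch, assuming PyTorch; whether to freeze or fine-tune the
# weights is left to the caller.
import numpy as np
import torch
import torch.nn as nn

# Load the GloVe-initialized weight matrix produced above
weights = torch.from_numpy(np.load('data/glove6b_init_300d.npy'))
ntoken, emb_dim = weights.shape

emb = nn.Embedding(ntoken, emb_dim)  # one row per vocabulary word
emb.weight.data.copy_(weights)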