def FrequentWords(data_dirs, suffixes, max_key_words):
    """Count tokens across all matching files and return the most frequent.

    Every file name returned by ``matchingFiles(data_dirs, suffixes)`` is
    passed to ``tokenize``; zero-length tokens are skipped and every other
    token is tallied. A progress message is printed after each 5000 files.

    Args:
        data_dirs: Forwarded unchanged to ``matchingFiles``.
        suffixes: Forwarded unchanged to ``matchingFiles``.
        max_key_words: Upper bound on the number of entries returned; passed
            straight to ``Counter.most_common``.

    Returns:
        A list of ``(token, count)`` pairs ordered from most to least
        frequent, at most ``max_key_words`` entries long. Note that this is
        a list of pairs, not a dictionary.
    """
    matches = matchingFiles(data_dirs, suffixes)
    token_count = Counter()
    for files_done, file_name in enumerate(matches, start=1):
        # A Counter reports 0 for unseen keys, so tallying needs no
        # KeyError fallback; update() counts each yielded token once.
        token_count.update(
            token for token in tokenize(file_name) if len(token) > 0
        )
        if files_done % 5000 == 0:
            print("Completed parsing %d files ..." % files_done)
    return token_count.most_common(max_key_words)
if args.command == 'test': model = EnsembleModel(keywords, winSize=args.win) else: # model = RnnAttentionDense2(keywords, winSize=args.win, # wdim=args.dim, zdim=args.zdim, zdim2=args.zdim2, # reg=args.reg, # load_from_file=False) model = RnnLSTM(keywords, load_from_file=False, winSize=10) if args.restore is not None: model.restoreFrom(args.restore) print('Restored model from %s' % args.restore) data_dir = os.path.join('../data', args.project) files = utils.matchingFiles([data_dir], args.langs) filesAndTokens = [] # choose the first half of files based on a deterministic random range robj = random.Random(12345) robj.shuffle(files) if args.command == 'train': fileSubset = files[:int(len(files) / 2)] elif args.command == 'test': fileSubset = files[int(len(files) / 2):] else: fileSubset = files[int(len(files) / 2):] if args.command == 'train' or args.command == 'test': for i, name in enumerate(fileSubset):
print keywords if args.command == 'test': model = EnsembleModel(keywords, winSize=args.win) else: model = RnnAttentionDense2(keywords, winSize=args.win, wdim=args.dim, zdim=args.zdim, zdim2=args.zdim2, reg=args.reg, load_from_file=False) if args.restore is not None: model.restoreFrom(args.restore) print 'Restored model from %s' % args.restore data_dir = os.path.join('../data', args.project) files = utils.matchingFiles([data_dir], args.langs) filesAndTokens = [] # choose the first half of files based on a deterministic random range robj = random.Random(12345) robj.shuffle(files) if args.command == 'train': fileSubset = files[:len(files)/2] elif args.command == 'test': fileSubset = files[len(files)/2:] else: fileSubset = files[len(files)/2:] if args.command == 'train' or args.command == 'test': for i,name in enumerate(fileSubset):