Example #1
0
def FrequentWords(data_dirs, suffixes, max_key_words):
    """Count token occurrences across a set of source files.

    Tokenizes every file selected by ``matchingFiles(data_dirs, suffixes)``
    and tallies how often each non-empty token appears.

    Args:
        data_dirs: Directories to search for input files.
        suffixes: File-name suffixes used to select files.
        max_key_words: Maximum number of (token, count) pairs to return.

    Returns:
        A list of up to ``max_key_words`` ``(token, count)`` tuples, most
        frequent first, as produced by ``Counter.most_common``.
    """
    matches = matchingFiles(data_dirs, suffixes)

    token_count = Counter()
    for files_done, file_name in enumerate(matches, start=1):
        # Counter returns 0 for missing keys, so `+= 1` never raises
        # KeyError; the original try/bare-except was dead code that could
        # also swallow unrelated errors (even KeyboardInterrupt).
        for token in tokenize(file_name):
            if token:
                token_count[token] += 1
        # Periodic progress report for large corpora.
        if files_done % 5000 == 0:
            print("Completed parsing %d files ..." % files_done)

    return token_count.most_common(max_key_words)
def FrequentWords(data_dirs, suffixes, max_key_words):
  """Tally how often each token occurs in the matching files.

  Walks the files returned by ``matchingFiles(data_dirs, suffixes)``,
  tokenizes each one, and counts every non-empty token.

  Args:
    data_dirs: Directories to scan for input files.
    suffixes: File-name suffixes that select which files to read.
    max_key_words: Upper bound on the number of results returned.

  Returns:
    Up to ``max_key_words`` ``(token, count)`` pairs sorted by descending
    count (``Counter.most_common``).
  """
  matches = matchingFiles(data_dirs, suffixes)

  token_count = Counter()
  for files_done, file_name in enumerate(matches, start=1):
    for token in tokenize(file_name):
      # Skip empty tokens; Counter supplies 0 for unseen keys, so the
      # original bare `except:` fallback could never fire and only
      # risked hiding genuine errors.
      if token:
        token_count[token] += 1
    # Progress indicator every 5000 files.
    if files_done % 5000 == 0:
      print("Completed parsing %d files ..." % files_done)

  return token_count.most_common(max_key_words)
    # NOTE(review): this span is a fragment of a larger driver function whose
    # `def` is not visible here; `args`, `keywords`, `EnsembleModel`, `RnnLSTM`,
    # `utils`, `os`, and `random` are all resolved outside this view.
    # Pick the model: an ensemble for evaluation, an LSTM otherwise.
    if args.command == 'test':
        model = EnsembleModel(keywords, winSize=args.win)
    else:
        # model = RnnAttentionDense2(keywords, winSize=args.win,
        #                            wdim=args.dim, zdim=args.zdim, zdim2=args.zdim2,
        #                            reg=args.reg,
        #                            load_from_file=False)
        model = RnnLSTM(keywords, load_from_file=False, winSize=10)

    # Optionally resume from a saved checkpoint path.
    if args.restore is not None:
        model.restoreFrom(args.restore)
        print('Restored model from %s' % args.restore)

    data_dir = os.path.join('../data', args.project)
    files = utils.matchingFiles([data_dir], args.langs)
    filesAndTokens = []

    # choose the first half of files based on a deterministic random range
    # Fixed seed so the train/test split is reproducible across runs.
    robj = random.Random(12345)
    robj.shuffle(files)

    # Train on the first half of the shuffled list, evaluate on the second;
    # any other command also gets the second half.
    if args.command == 'train':
        fileSubset = files[:int(len(files) / 2)]
    elif args.command == 'test':
        fileSubset = files[int(len(files) / 2):]
    else:
        fileSubset = files[int(len(files) / 2):]

    if args.command == 'train' or args.command == 'test':
        # NOTE(review): the loop body is truncated in this excerpt — the
        # statements that process each (i, name) pair are not visible here.
        for i, name in enumerate(fileSubset):
    # NOTE(review): this is a second, Python 2 variant of the driver fragment
    # above (`print` statement syntax, integer division on list slices).
    # Names (`args`, `keywords`, `EnsembleModel`, `RnnAttentionDense2`,
    # `utils`, `os`, `random`) are resolved outside this view.
    print keywords

    # Ensemble model for evaluation, attention-based RNN otherwise.
    if args.command == 'test':
        model = EnsembleModel(keywords, winSize=args.win)
    else:
        model = RnnAttentionDense2(keywords, winSize=args.win,
                                   wdim=args.dim, zdim=args.zdim, zdim2=args.zdim2,
                                   reg=args.reg,
                                   load_from_file=False)

    # Optionally resume from a saved checkpoint path.
    if args.restore is not None:
        model.restoreFrom(args.restore)
        print 'Restored model from %s' % args.restore

    data_dir = os.path.join('../data', args.project)
    files = utils.matchingFiles([data_dir], args.langs)
    filesAndTokens = []

    # choose the first half of files based on a deterministic random range
    # Fixed seed keeps the train/test split reproducible across runs.
    robj = random.Random(12345)
    robj.shuffle(files)

    # First half of the shuffled files for training, second half otherwise
    # (Python 2 `/` on ints is floor division).
    if args.command == 'train':
        fileSubset = files[:len(files)/2]
    elif args.command == 'test':
        fileSubset = files[len(files)/2:]
    else:
        fileSubset = files[len(files)/2:]

    if args.command == 'train' or args.command == 'test':
        # NOTE(review): the loop body is cut off at the end of this excerpt.
        for i,name in enumerate(fileSubset):