예제 #1
0
def main1():
    """Bot-chip puzzle (AoC-2016-day-10 style): seed bots from the 'value'
    lines, then dispatch give-low/give-high instructions until none remain.

    NOTE(review): relies on module-level names `load_data`, `extract`, `Bot`
    and `output` (presumably a dict keyed by output position) -- confirm
    they exist at module scope.
    """
    # 'value N goes to bot M' lines seed the bots; the rest are move rules
    setup = [line for line in load_data() if line.startswith('value')]
    instructions = [
        line for line in load_data() if not line.startswith('value')
    ]

    for line in setup:
        text = line.split()
        chip = int(text[1])     # chip value
        number = text[-1]       # receiving bot id
        Bot.get_or_create(number).receives(chip)

    # Loop until every instruction has been consumed; extract() presumably
    # removes the matched instruction from the list -- otherwise this would
    # never terminate.  TODO confirm extract() mutates `instructions`.
    while instructions:
        for bot in list(Bot.all_bots.values()):
            if bot.ready:
                sentinel = 'bot {} gives low'.format(bot.number)
                instruction = extract(instructions, sentinel)
                text = instruction.split()
                if text[5] == 'bot':
                    low_recv = Bot.get_or_create(text[6])
                    bot.gives_low_to(low_recv)
                else:
                    # receiver is an output bin
                    position = int(text[6])
                    output[position] = bot.extract_lowest()

                if text[10] == 'bot':
                    high_recv = Bot.get_or_create(text[11])
                    bot.gives_high_to(high_recv)
                else:
                    # receiver is an output bin
                    position = int(text[11])
                    output[position] = bot.extract_high()
예제 #2
0
def test_defects_module(filename, plot):
    """Run the defect-detection module against *filename*.

    Trains the module on the canned training file, loads and reshapes the
    test data, optionally plots the decision surface (plot == "yes"), and
    finally runs defect finding over the test timeline.
    """
    # initialize defect module
    train_values = 5
    train_trees = 15
    filename_train = "train_data/defects_acc_data.output"
    init_server.init_defects_module(filename_train, train_values, train_trees)

    # load test data
    test_values = 5
    test_data = cmn.aver_std_array(cmn.load_data(filename, (3,)), test_values)
    # use floor division: len(...)/2 is a float on Python 3 and reshape
    # rejects non-integer dimensions (identical result for ints on Python 2)
    test_data = test_data.reshape(len(test_data) // 2, 2)
    test_times = cmn.label_array(cmn.load_data(filename, (0,)), test_values)

    # plot results
    if plot == "yes":
        train_data = cmn.aver_std_array(cmn.load_data(filename_train, (3,)), train_values)
        train_data = train_data.reshape(len(train_data) // 2, 2)
        xx, yy = cmn.get_grid(train_data[:, [0, 1]])
        train_predicted = df.predicted(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
        test_predicted = df.predicted(test_data)
        cmn.plot_2D_data(test_data, test_predicted, train_data, train_predicted, [6.8, 12.8], [-0.1, 3.0])

    # start finding by time
    df.find_actions(test_data, test_times)
    return
def init_defects_module(filename, values, trees):
    """Train the defects module from an accelerometer output file.

    The file layout is: time,accx,accy,accz,label* -- column 3 (accz) feeds
    the features, column 4 the labels.
    """
    # use floor division: len(...)/2 is a float on Python 3 and reshape
    # rejects non-integer dimensions (identical result for ints on Python 2)
    train_data = cmn.aver_std_array(cmn.load_data(filename, (3,)), values)
    train_data = train_data.reshape(len(train_data) // 2, 2)
    train_predicted = cmn.label_array(cmn.load_data(filename, (4,)), values)
    df.init_defects_module(values, trees, train_data, train_predicted)
    return
def init_turns_module(filename, values, trees):
    """Train the turns module from a compass output file.

    The file layout is: time,magn,label* -- column 1 (magn) feeds the
    features (as first differences), column 2 the labels.
    """
    train_data = cmn.get_diff_array(cmn.load_data(filename, (1,)))
    train_data = cmn.aver_std_array(train_data, values)
    # floor division: len(...)/2 is a float on Python 3 and reshape
    # rejects non-integer dimensions (identical result for ints on Python 2)
    train_data = train_data.reshape(len(train_data) // 2, 2)
    train_predicted = cmn.label_array(cmn.load_data(filename, (2,)), values)
    tr.init_turns_module(values, trees, train_data, train_predicted)
    return
예제 #5
0
파일: ngram.py 프로젝트: nptit/kaggle
    def main(self):
        """Full train/predict pipeline: 10-fold CV producing out-of-fold
        predictions plus averaged test predictions under self.output_dir.

        Skips entirely if test.csv already exists in the output directory.
        """
        t_start = datetime.now()
        logger.info(' {} / {} '.format(self.name, self.random_seed).center(62, '='))
        logger.info('Hyperparameters:\n{}'.format(pprint.pformat(self.params)))
        if os.path.isfile(os.path.join(self.output_dir, 'test.csv')):
            logger.info('Output already exists - skipping')
            return

        # Initialize the random number generator
        self.random_state = RandomState(self.random_seed)
        np.random.seed(int.from_bytes(self.random_state.bytes(4), byteorder=sys.byteorder))

        # normalize text to ASCII before vectorization
        train_df = common.load_data('train')
        train_df['comment_text'] = train_df['comment_text'].apply(unidecode)
        test_df = common.load_data('test')
        test_df['comment_text'] = test_df['comment_text'].apply(unidecode)

        vectorizer = self.build_vectorizer(train_df, test_df)

        folds = common.stratified_kfold(train_df, random_seed=self.random_seed)
        for fold_num, train_ids, val_ids in folds:
            logger.info(f'Fold #{fold_num}')

            fold_train_df = train_df[train_df['id'].isin(train_ids)]
            fold_val_df = train_df[train_df['id'].isin(val_ids)]
            models = self.train(fold_num, vectorizer, fold_train_df, fold_val_df)

            logger.info('Generating the out-of-fold predictions')
            path = os.path.join(self.output_dir, f'fold{fold_num}_validation.csv')
            self.predict(models, vectorizer, fold_val_df, path)

            logger.info('Generating the test predictions')
            path = os.path.join(self.output_dir, f'fold{fold_num}_test.csv')
            self.predict(models, vectorizer, test_df, path)

        logger.info('Combining the out-of-fold predictions')
        # NOTE(review): assumes exactly 10 folds -- keep in sync with
        # common.stratified_kfold
        df_parts = []
        for fold_num in range(1, 11):
            path = os.path.join(self.output_dir, f'fold{fold_num}_validation.csv')
            df_part = pd.read_csv(path, usecols=['id'] + common.LABELS)
            df_parts.append(df_part)
        train_pred = pd.concat(df_parts)
        path = os.path.join(self.output_dir, 'train.csv')
        train_pred.to_csv(path, index=False)

        logger.info('Averaging the test predictions')
        df_parts = []
        for fold_num in range(1, 11):
            path = os.path.join(self.output_dir, f'fold{fold_num}_test.csv')
            df_part = pd.read_csv(path, usecols=['id'] + common.LABELS)
            df_parts.append(df_part)
        # mean over the 10 per-fold predictions for each test id
        test_pred = pd.concat(df_parts).groupby('id', as_index=False).mean()
        path = os.path.join(self.output_dir, 'test.csv')
        test_pred.to_csv(path, index=False)

        logger.info('Total elapsed time - {}'.format(datetime.now() - t_start))
def init_behavior_defects_module(filename, values, trees):
    """Train the behavior-defects module from a generated file.

    File structure: time,speed,turns,defects,lat,lon,labels* -- columns
    1..3 become the feature matrix, column 6 the labels.
    """
    def _labeled_column(col_index):
        # one labeled column, reshaped to a single-feature matrix
        labeled = cmn.label_array(cmn.load_data(filename, (col_index,)), values)
        return labeled.reshape(len(labeled), 1)

    train_data = np.hstack(
        (_labeled_column(1), _labeled_column(2), _labeled_column(3)))
    train_predicted = cmn.label_array(cmn.load_data(filename, (6,)), values)
    bd.init_behavior_defects_module(values, trees, train_data, train_predicted)
    return
예제 #7
0
def main():
    """Stack the nodata-corrected input maps depth-wise and save them as
    one flat row per cell."""
    stacked = np.dstack(
        [correct_nodata(common.load_data(p, skip=6)) for p in INPUT_PATHS])
    common.save_data(stacked.reshape(-1, len(MAP_NAMES)), OUTPUT_PATH)
def load_errors():
    """Load the error file named by --file from the file storage location.

    Prints a message and returns None when the file does not exist.
    """
    args = parsed_args()
    filepath = common.get_file_storage_location() + "/" + args.file
    if os.path.isfile(filepath):
        return common.load_data(filepath)
    print(filepath + " is not a file, provide an existing file")
    return
예제 #9
0
def main1():
    """Count and return the valid addresses after dropping invalid ones."""
    candidates = reject(is_invalid, load_data())
    valids = list(filter(is_valid, candidates))
    print('total valid ips:', len(valids))
    return valids
예제 #10
0
 def test_sqs_policies(self):
     """Each SQS policy in the fixture maps to an expected cross-account
     result (True = violations found for the given trusted account)."""
     policies = load_data('iam/sqs-policies.json')
     for p, expected in zip(
             policies, [False, True, True, False,
                        False, False, False, False]):
         violations = check_cross_account(p, set(['221800032964']))
         self.assertEqual(bool(violations), expected)
예제 #11
0
 def test_sqs_policies(self):
     """Each SQS policy in the fixture maps to an expected cross-account
     result (True = violations found for the given trusted account)."""
     policies = load_data('iam/sqs-policies.json')
     for p, expected in zip(
             policies,
         [False, True, True, False, False, False, False, False]):
         violations = check_cross_account(p, set(['221800032964']))
         self.assertEqual(bool(violations), expected)
예제 #12
0
def main():
    """Drive a 50x6 Screen through the puzzle instructions, report the lit
    pixel count (part 1) and render the display (part 2)."""
    display = Screen(50, 6)
    for instruction in load_data():
        display.execute_instruction(instruction)
    print('Part 1: total pixels on:', sum(display))
    # Part 2's answer is read off the rendered screen
    print(display)
예제 #13
0
파일: main.py 프로젝트: 0FuzzingQ/xssee
def get_fuzzing(target, ck, data):
    """Python 2 GET-parameter XSS probe: append `flag` to each query value
    of *target* and check whether it is reflected in the response.

    NOTE(review): the `data` parameter is overwritten by load_data(...)
    inside the loop, and `return False` exits on the FIRST parameter that
    does not reflect -- confirm both are intended.
    """
    print '[*]now demo test get xss......'
    parsed_tuple = urlparse.urlparse(urllib.unquote(target))
    url_query = urlparse.parse_qs(parsed_tuple.query, True)
    print url_query
    for i in url_query.keys():
        # original 'key=value' pair and its flag-tainted replacement
        query = str(i) + '=' + str(url_query[i][0])
        tmp = query + flag
        location = str(url_query[i][0]) + flag

        now_target = target.replace(query, tmp)
        try:
            data = load_data(agent_list, ck)
            req = urllib2.Request(now_target, data=data)
            res = urllib2.urlopen(req)
            content_html = res.read()
            if flag in content_html or location in content_html:
                # flag reflected: run the deeper detection step
                get_detect(now_target, ck, flag, True)
            else:
                return False
        except:
            # best-effort probe: network errors are silently ignored
            pass
예제 #14
0
def main(argv):
    """Evaluate a saved model on a test set: print metrics, then one
    predicted-labels line per test example."""
    # Parse from the supplied argv for consistency with the training entry
    # point (previously called parse_args() with no arguments, silently
    # ignoring `argv`).
    options = argparser().parse_args(argv[1:])

    model, tokenizer, labels, config = load_trained_model(options.model_dir)
    test_texts, test_labels = load_data(
        options.test_data,
        options.input_format,
        config.multiclass
    )

    label_encoder = MultiLabelBinarizer(classes=labels)

    tokenize = make_tokenization_function(tokenizer, config.seq_len)

    metrics_values = model.evaluate(
        tokenize(test_texts),
        label_encoder.fit_transform(test_labels),
        batch_size=options.batch_size
    )
    for name, value in zip(model.metrics_names, metrics_values):
        print(f'{name}\t{value}')

    predictions = model.predict(
        tokenize(test_texts),
        verbose=1,
        batch_size=options.batch_size
    )

    assert len(test_texts) == len(predictions)
    for text, gold, preds in zip(test_texts, test_labels, predictions):
        if config.multiclass:
            # single-label task: argmax over class scores
            pred_labels = [labels[preds.argmax()]]
        else:
            # multi-label task: threshold each class score at 0.5
            pred_labels = [labels[i] for i, v in enumerate(preds) if v > 0.5]
        print('{}\t{}'.format(','.join(pred_labels), text))
예제 #15
0
def main():
    """Produce a probability + prediction record for every row of the
    full dataset and save it with the type column prepended.

    Certain rows keep their known label with probability 1.0; uncertain
    rows get ensemble probabilities plus a decision; -99 rows are filled
    with -99 sentinels.
    """
    print("Loading...")
    fullset = common.load_data(FULLSET_PATH, sep=',')

    types = get_types(fullset)

    print("Predicting...")
    uncertain_mask = (types == UNCERTAIN_LABEL)
    uncertainset = fullset[uncertain_mask]
    probs = get_probs_for_uncertain(uncertainset)
    # map original row index -> probability vector for the uncertain rows
    linenum_to_probs = {
        idx: prob
        for idx, prob in zip(np.nonzero(uncertain_mask)[0], probs)
    }

    print("Deciding...")
    probs_and_predictions = []
    for i, (row, type_) in enumerate(zip(fullset, types)):
        if type_ == UNCERTAIN_LABEL:
            probs = linenum_to_probs[i].tolist()
            prediction, order = check_and_decide(row[:common.N_DISASTER],
                                                 probs)
            # order is stored 1-based
            probs_and_predictions.append(probs + [prediction] + [order + 1])
        elif type_ == -99:
            # sentinel rows: fill the whole record with -99
            probs_and_predictions.append([-99] * (common.N_CLASS + 2))
        else:
            # certain rows: one-hot probability on the known type, order 0
            probs = [0.0] * common.N_CLASS
            probs[type_] = 1.0
            probs_and_predictions.append(probs + [type_] + [0])

    print("Saving...")
    common.save_data(
        np.concatenate((types[:, np.newaxis], probs_and_predictions), axis=1),
        OUTPUT_PATH)
def load_errors():
    """Load the error reports file named by --file from the found-errors
    storage location; prints a message and returns None when missing."""
    args = parsed_args()
    filepath = common.found_errors_storage_location() + "/" + args.file
    if os.path.isfile(filepath):
        return common.load_data(filepath)
    print(filepath + " is not a file, provide an existing file")
    return
예제 #17
0
def main2():
    """Same as main1 but with register c starting at 1.

    Executes the instruction list and reports the value of register a.
    """
    registers = dict.fromkeys('abd', 0)
    registers['c'] = 1

    # list() replaces the pass-through comprehension (identical result)
    instructions = list(load_data())
    registers = execute(instructions, registers)
    result = registers['a']
    print("register a:", result)
예제 #18
0
def read_tags(text_file, fact_file, fact_type):
    """Returns the text as a unicode string as well as a dictionary with
    the various kinds of tags.

    For 'BAE' fact files the dictionary is built from STRUCTURE tags;
    otherwise the basic reader is used.
    """
    text, tags = load_data(text_file, fact_file, fact_type)
    if fact_type == 'BAE':
        tag_dictionary = read_tags_bae(tags_with_name(tags, 'STRUCTURE'))
    else:
        tag_dictionary = read_tags_basic(tags)
    return text, tag_dictionary
예제 #19
0
def main():
    """CLI entry point: listen for barks addressed to a known user."""
    parser = ArgumentParser(prog="listen")
    parser.add_argument("user")
    parser.add_argument("-n", "--host", default="127.0.0.1",
                        help="hostname of inboxes")
    listener_args = parser.parse_args()

    users = common.load_data("users.pickle")
    assert listener_args.user in users

    Reactor(GetBarks(listener_args.user, listener_args.host)).run()
예제 #20
0
def main(argv):
    """Train a text classifier on train/dev data and evaluate on dev.

    Returns 0 on success; saves the trained model when --save_model is set.
    """
    options = argparser().parse_args(argv[1:])

    train_texts, train_labels = load_data(options.train, options.input_format,
                                          options.multiclass)
    dev_texts, dev_labels = load_data(options.dev, options.input_format,
                                      options.multiclass)
    num_train_examples = len(train_texts)

    # binarize labels; the encoder is fit on the training labels only
    label_encoder = MultiLabelBinarizer()
    label_encoder.fit(train_labels)
    train_Y = label_encoder.transform(train_labels)
    dev_Y = label_encoder.transform(dev_labels)
    num_labels = len(label_encoder.classes_)

    classifier, tokenizer, optimizer, config = prepare_classifier(
        num_train_examples, num_labels, options)
    # record the task type so evaluation can pick the right decision rule
    config.multiclass = options.multiclass

    tokenize = make_tokenization_function(tokenizer, options.seq_len)
    train_X = tokenize(train_texts)
    dev_X = tokenize(dev_texts)

    history = classifier.fit(
        train_X,
        train_Y,
        epochs=options.epochs,
        batch_size=options.batch_size,
        validation_data=(dev_X, dev_Y),
    )

    metrics_values = classifier.evaluate(dev_X,
                                         dev_Y,
                                         batch_size=options.batch_size)
    for name, value in zip(classifier.metrics_names, metrics_values):
        print(f'{name}\t{value}')

    if options.save_model is not None:
        save_trained_model(options.save_model, classifier, tokenizer,
                           label_encoder.classes_, config)

    return 0
예제 #21
0
def main():
    """Compute the uncertain-count map for the full dataset and save it
    reshaped to the map grid."""
    print('Loading...')
    dataset = common.load_data(FULLSET_PATH, sep=',')

    print('Processing...')
    counts = get_uncertain_count_map(dataset)

    print('Saving...')
    common.save_map(counts.reshape(common.N_ROWS, -1), OUTPUT_PATH)

    print('Done!')
예제 #22
0
파일: main.py 프로젝트: 0FuzzingQ/xssee
def post_fuzzing(target, ck, data):
    """Python 2 POST XSS probe: fetch *target*, then either list its form
    inputs ('auto' mode) or taint user-supplied POST params one at a time
    ('provide' mode) and check whether the flag is reflected.

    NOTE(review): the `data` parameter is immediately overwritten by
    load_data(...); bare except blocks swallow all errors -- confirm
    both are intended.
    """
    print '[*]now demo test post xss......'
    try:
        data = load_data(agent_list, ck)
        req = urllib2.Request(target, data=data)
        res = urllib2.urlopen(req)
        content = res.read()
        if res.code == 301 or res.code == 302:
            print '[*]we get a 301/302 when connect to the target,please recheck it'
            exit()
        elif res.code == 200:
            param_method = raw_input(
                '[*]you can provide params for accuracy or auto find by xssee: [0]provide [1]auto'
            )
            if param_method == '1':
                # auto mode: just enumerate the page's form inputs
                print 'now detect params......'
                content = BeautifulSoup(content, 'html.parser')
                input_list = content.select('input')
                for i in range(0, len(input_list)):
                    name = input_list[i]['name']
                    in_type = input_list[i]['type']
                    print name, in_type
            elif param_method == '0':
                # provide mode: parse 'k=v&k2=v2' input into a dict
                print '[*]post data like: id=1&name=2 etc'
                post_str = raw_input('[*]please input post data:')
                param_list = post_str.strip().split('&')
                param_dict = {}

                for i in range(0, len(param_list)):
                    param_dict[param_list[i].strip().split('=')
                               [0]] = param_list[i].strip().split('=')[1]
                # taint one parameter at a time, probe, then restore it
                for i in param_dict.keys():
                    param_dict[i] = param_dict[i] + flag
                    try:
                        post_data = load_post_data(agent_list, ck, param_dict)

                        req = urllib2.Request(target, data=post_data)
                        res = urllib2.urlopen(req)
                        content_html = res.read()
                        if flag in content_html or param_list[
                                i] in content_html:
                            print 'ok!'
                            post_detect(target, ck, flag, param_dict, True)
                        param_dict[i] = param_dict[i].replace(flag, '')

                    except:
                        # best-effort probe: connection errors are ignored
                        pass

    except:
        print '[*]connect failed to target'
        exit()
예제 #23
0
    def on_timer_task(self, event):
        """Periodic timer: refresh the user list when stale, send one
        random bark to that user's outbox, and reschedule itself."""
        now = time.time()
        # reload users.pickle at most once per user_reread_period seconds
        if now - self.last_user_reread > self.user_reread_period:
            self.users = common.load_data("users.pickle")
            self.last_user_reread = now

        bark = self.make_random_bark()
        sender = self.linker.sender("//%s/outbox/%s" % (self.hostname, bark.user))
        sender.send(tuple(bark))

        # re-arm the timer for the next bark
        event.reactor.schedule(self.bark_period, self)
예제 #24
0
def main():
    """Extract the upward trainset from the full dataset and persist it."""
    print('Loading...')
    dataset = common.load_data(FULLSET_PATH, sep=',')

    print('Processing...')
    subset = get_trainset(dataset, upward=True)

    print('Saving...')
    common.save_data(subset, OUTPUT_PATH)

    print('Done!')
예제 #25
0
def main_report_count_in_report_file(reports_filepath):
    reports_data = common.load_data(reports_filepath)
    if reports_data == None:
        print(reports_filepath + " has no data")
        raise ValueError(reports_filepath + " has no data")
    count = 0
    for error_type_id in generate_webpage_with_error_output.for_review():
        for e in reports_data:
            if e['error_id'] == error_type_id:
                count += 1
    return count
예제 #26
0
def main():
    """CLI entry point: post one bark for a known user."""
    parser = ArgumentParser(prog="bark")
    parser.add_argument("user")
    parser.add_argument("content", nargs="+")
    parser.add_argument("-n", "--host", default="127.0.0.1",
                        help="hostname of outboxes")
    args = parser.parse_args()

    users = common.load_data("users.pickle")
    assert args.user in users

    Reactor(PutBark(args.user, " ".join(args.content), args.host)).run()
예제 #27
0
    def on_timer_task(self, event):
        """Periodic timer: refresh the user list when stale, send one
        random bark to that user's outbox, and reschedule itself."""
        now = time.time()
        # reload users.pickle at most once per user_reread_period seconds
        if now - self.last_user_reread > self.user_reread_period:
            self.users = common.load_data("users.pickle")
            self.last_user_reread = now

        bark = self.make_random_bark()
        sender = self.linker.sender("//%s/outbox/%s" %
                                    (self.hostname, bark.user))
        sender.send(tuple(bark))

        # re-arm the timer for the next bark
        event.reactor.schedule(self.bark_period, self)
예제 #28
0
def main():
    parser = ArgumentParser(prog="listen")
    parser.add_argument("user")
    parser.add_argument("-n",
                        "--host",
                        default="127.0.0.1",
                        help="hostname of inboxes")
    args = parser.parse_args()

    users = common.load_data("users.pickle")

    assert args.user in users
    Reactor(GetBarks(args.user, args.host)).run()
예제 #29
0
파일: evaluation.py 프로젝트: nptit/kaggle
def main():
    """Aggregate per-fold validation ROC-AUC scores for every
    model/seed/params directory under common.OUTPUT_DIR and write an
    evaluation.csv per random seed.

    `path` is used as a stack of directory components: pushed before
    descending into each nested listdir loop, popped on the way back up.
    """
    train_df = common.load_data('train')
    path = [common.OUTPUT_DIR]
    for name in os.listdir(os.path.join(*path)):
        if not os.path.isdir(os.path.join(*path, name)):
            continue
        path.append(name)
        for random_seed in os.listdir(os.path.join(*path)):
            if not os.path.isdir(os.path.join(*path, random_seed)):
                continue
            path.append(random_seed)
            results = []
            for params_str in os.listdir(os.path.join(*path)):
                if not os.path.isdir(os.path.join(*path, params_str)):
                    continue
                path.append(params_str)
                model_results = OrderedDict({'name': name})
                # directory name encodes hyperparameters as key=value
                # pairs joined by '_'
                for param in sorted(params_str.split('_')):
                    try:
                        k, v = param.split('=')
                        k = k.replace('-', '_')
                        model_results[k] = v
                    except ValueError:
                        # token without exactly one '=': not a parameter
                        pass
                scores = []
                for fold_num in range(1, 11):
                    fold_csv = os.path.join(*path,
                                            f'fold{fold_num}_validation.csv')
                    if os.path.isfile(fold_csv):
                        output = pd.read_csv(fold_csv).sort_values('id')
                        # align gold labels with this fold's prediction ids
                        target = train_df[train_df['id'].isin(
                            output['id'])].sort_values('id')
                        assert (
                            output['id'].values == target['id'].values).all()
                        output = output[common.LABELS].values
                        target = target[common.LABELS].values
                        score = roc_auc_score(target, output, average='macro')
                        model_results[f'fold{fold_num}'] = score
                        scores.append(score)
                if scores:
                    model_results['mean'] = np.mean(scores)
                    model_results['std'] = np.std(scores)
                results.append(model_results)
                path.pop()
            if results:
                # best mean score first
                results = pd.DataFrame(results).sort_values('mean',
                                                            ascending=False)
                results.to_csv(os.path.join(*path, 'evaluation.csv'),
                               index=False)
            path.pop()
        path.pop()
def main():
    """Build a playlist of every favorite artist's recent releases,
    split into a main bucket and a remix bucket.

    NOTE(review): the commented-out try/except blocks used to guard the
    Spotify API calls; without them one failing artist aborts the run.
    """
    ids = load_data(favorite_artists[CURRENT_USER])
    artist_names = load_data_at(favorite_artists[CURRENT_USER], 1)
    new_songs = []
    index = 0
    num_ids = str(len(ids))
    for artist_id in ids:
        artist_name = artist_names[index]
        index = index + 1
        # progress line: 'k/N: artist'
        print(str(index) + '/' + num_ids + ': ' + artist_name)
        time.sleep(PAUSE_TIME)
        results = sp.artist_albums(artist_id, album_type='album')
        new_songs.extend(get_recent_tracks(results, artist_id))
        time.sleep(PAUSE_TIME)
        results = sp.artist_albums(artist_id, album_type='single')
        new_songs.extend(get_recent_tracks(results, artist_id))
    print('New songs: ' + str(len(new_songs)))

    print('Removing duplicates...')
    new_songs = remove_duplicates(new_songs)
    print('New songs: ' + str(len(new_songs)))

    if CURRENT_USER == ALEJANDRO:
        print('Removing tracks in history...')
        new_songs = remove_history_tracks(new_songs)
        print('New songs: ' + str(len(new_songs)))

    # split remixes (by name indicator) from the main tracks
    # NOTE(review): a track matching several indicators is appended to
    # remix_bucket once per match -- confirm this duplication is intended
    main_bucket = []
    remix_bucket = []
    for track in new_songs:
        name = track['name']
        add_to_main = True
        for indicator in REMIX_INDICATORS:
            if indicator in name.lower():
                remix_bucket.append(track)
                add_to_main = False
        if add_to_main:
            main_bucket.append(track)
    print('main_bucket: ' + str(len(main_bucket)))
    print('remix_bucket: ' + str(len(remix_bucket)))
    playlist_id = create_playlist(sp, playlist_title[CURRENT_USER],
                                  'All new music released after last friday')
    add_to_playlist(playlist_id, tracks_to_ids(main_bucket))
    add_to_playlist(playlist_id, tracks_to_ids(remix_bucket))
예제 #31
0
def main1():
    '''find most common char in each column'''
    data = load_data()
    columns = zip(*data)

    result = []
    for col in columns:
        count = Counter()
        for char in col:
            count.update(char)
        mode = count.most_common(1)[0][0]
        result.append(mode)

    print('final answer:', ''.join(result))
예제 #32
0
def main():
    """Train N_MODEL models on splits of the one-hot-encoded trainset,
    evaluating and saving each model with its training history."""
    dataset = common.onehot_encode(common.load_data(TRAINSET_PATH, sep=','), 0)

    for idx in range(N_MODEL):
        x_train, x_test, y_train, y_test = common.split(dataset, idx)
        x_train, x_test = common.normalize(x_train, x_test)

        model, history = train(x_train, y_train, N_EPOCH)
        model.evaluate(x_test, y_test)

        model.save(common.numbering(MODEL_PATH, idx))
        save_history(history, common.numbering(HISTORY_PATH, idx))

        print(idx, ' is done.')
예제 #33
0
def main():
    """Cluster the full dataset with k-modes, dump the label->codes map as
    JSON, and save the clustering cost plus centroids."""
    fullset = common.load_data(FULLSET_PATH, sep=',')
    samples = get_clust_samples(fullset)

    clusterer = KModes(n_clusters=N_CLUST, n_init=N_INIT, init='Huang', verbose=True)
    labels = clusterer.fit_predict(samples)

    mapping = get_label_to_codes(samples, labels)
    with open(JSON_PATH, 'w') as handle:
        json.dump(mapping, handle, sort_keys=True)

    common.save_data([[clusterer.cost_]] + clusterer.cluster_centroids_.tolist(),
                     RESULT_PATH)
def main():
    """Interactive bikeshare statistics loop; repeats until the user
    declines a restart."""
    while True:
        city, month, day, raw_data = ms.new_get_filters()
        frame = cf.load_data(city, month, day)

        ms.show_sample_data(frame, raw_data)
        # time stats run on the city's unfiltered data
        ms.time_stats(cf.default_data(city))
        ms.station_stats(frame)
        ms.trip_duration_stats(frame)
        ms.user_stats(frame, city)

        answer = input('\nWould you like to restart? Enter yes or no.\n')
        if answer.lower() != 'yes':
            break
예제 #35
0
def main2():
    '''
    now the least common
    '''
    data = load_data()
    columns = zip(*data)

    result = []
    for col in columns:
        count = Counter()
        for char in col:
            count.update(char)
        mode = count.most_common()[-1][0]
        result.append(mode)

    print('final answer part 2:', ''.join(result))
예제 #36
0
def main():
    """Run the extended Kalman filter (Q2) on one organized KITTI drive,
    writing results into a timestamped output directory."""
    basedir = '/home/nadav/studies/mapping_and_perception_autonomous_robots/kitti_data/orginaized_data'
    date = '2011_09_30'
    dataset_number = '0033'

    result_dir = r'/home/nadav/studies/mapping_and_perception_autonomous_robots/project_2/results'
    stamp = time.strftime("%Y.%m.%d-%H.%M")

    result_dir_timed = os.path.join(result_dir, stamp)
    print(f'saving to: {result_dir_timed}')
    os.makedirs(result_dir_timed, exist_ok=True)

    kitti_data = load_data(basedir, date, dataset_number)
    extended_kalman_filter(result_dir_timed, kitti_data)
예제 #37
0
def add_recent_tracks(results, artist_id):
    """Add the tracks of qualifying albums in *results* to the playlist,
    skipping blacklisted artists and recording new release dates."""
    if artist_id in load_data('black_list_artists.csv'):
        return
    for album in results['items']:
        if not should_add_to_list(album):
            continue
        time.sleep(PAUSE_TIME)
        print('release_date: ' + album['release_date'] + ' id: ' +
              album['id'] + ' name: ' + album['name'] +
              ' album_group: ' + album['album_group'] + ' album_type: ' +
              album['album_type'])
        if album['release_date'] not in accepted_dates:
            accepted_dates.append(album['release_date'])
        time.sleep(PAUSE_TIME)
        track_ids = get_album_track_ids(sp.album(album['uri']), artist_id)
        add_to_playlist(track_ids)
예제 #38
0
def get_probs_for_uncertain(uncertainset):
    """Average class probabilities over the N_MODEL ensemble for the
    uncertain rows; returns an array of shape (len(uncertainset), N_CLASS).

    NOTE(review): assumes columns from common.N_DISASTER onward are the
    categorical features to one-hot encode -- confirm against the fullset
    layout.
    """
    trainset = common.load_data(TRAINSET_PATH, sep=',')

    encoded_uncertainset = common.onehot_encode(
        uncertainset[:, common.N_DISASTER:], 0)
    encoded_trainset = common.onehot_encode(trainset, 0)

    prob_sums = np.zeros((len(uncertainset), common.N_CLASS))
    for i in range(N_MODEL):
        # normalize with each model's own train-split statistics
        x_train, _, _, _ = common.split(encoded_trainset, i)
        _, normalized_uncertainset = common.normalize(x_train,
                                                      encoded_uncertainset)
        prob_sums += tf.keras.models.load_model(common.numbering(
            MODEL_PATH, i)).predict(normalized_uncertainset)
        print(f'{i} is done.')

    return prob_sums / N_MODEL
예제 #39
0
def read_file():
    """Reads and returns childs and last_upd from the data file.

    Returns (None, None) when there is no data or the configured data
    format is not recognized.  (Previously an unrecognized
    common.DATA_FORMAT left both names unbound and raised
    UnboundLocalError at the return statement.)
    """
    childs = None
    last_upd = None
    data_lst = common.load_data()
    if data_lst:
        if common.DATA_FORMAT == common.JSON:
            childs, last_upd_str = data_lst
            # the stored date mask is ISO format (YYYY-MM-DD)
            last_upd = dt.datetime.strptime(last_upd_str, '%Y-%m-%d').date()
        elif common.DATA_FORMAT == common.PKL:
            childs, last_upd = data_lst
        # any other DATA_FORMAT falls through with (None, None)
    return childs, last_upd
예제 #40
0
def preprocessMain(stopword=False, basic_word = True, lemmatize=True):
  """Preprocess review records and write data/preprocessed_reviews.tsv.

  NOTE(review): `preprocess_records` is computed but never used -- the
  output loop writes the ORIGINAL `records` instead.  This looks like a
  bug; confirm whether the preprocessed records were meant to be written.
  """
  # Preprocess train data
  records = load_data('data/reviews.tsv')

  preprocess_records = [preprocess(record, stopword=stopword, basic_word = basic_word, lemmatize=lemmatize) for record in records]

  with open('data/preprocessed_reviews.tsv', 'w') as preprocess_file:
    header = 'id\treview\tsentiment\n'
    preprocess_file.write(header)

    for record in records:
      try:
        # id/review fields hold bytes, so decode before writing
        preprocess_file.write('%s\t%s\t%i\n' %
                            (record['id'].decode('UTF-8'), record['review'].decode('UTF-8'), record['sentiment']))
      except UnicodeEncodeError:
        # skip records that cannot be written as UTF-8
        print("unicode encode error")
        continue
  import sys
  sys.stdout.flush()
def make_query_to_reload_only_affected_objects(input_filename_with_reports, output_query_filename):
    """Write a JOSM reload query covering every error category present in
    the given reports file, archiving any previous query file first."""
    input_filepath = common.get_file_storage_location() + "/" + input_filename_with_reports
    output_filepath = root() + 'reload_querries/' + output_query_filename
    if not os.path.isfile(input_filepath):
        print("file not found")
        return
    # ensure the output directory exists
    directory_path = os.path.split(output_filepath)[0]
    pathlib.Path(directory_path).mkdir(parents=True, exist_ok=True)
    # archive any previous query under a timestamped name
    archived_filepath = output_filepath + "-archived-" + str(datetime.datetime.now()) + ".query"
    try:
        move_file(output_filepath, archived_filepath)
    except FileNotFoundError:
        pass  # OK: first run, or the cache was deleted
    with open(output_filepath, 'w') as query_file:
        # collect distinct error ids, preserving first-seen order
        error_ids = []
        for report in common.load_data(input_filepath):
            if report['error_id'] not in error_ids:
                error_ids.append(report['error_id'])
        query_file.write(common.get_query_for_loading_errors_by_category(
            filename=input_filename_with_reports,
            printed_error_ids=error_ids,
            format="josm"))
예제 #42
0
def main(params=None):
    """Train and evaluate an RNN text classifier with early stopping.

    Runs params['n_dev_folds'] train/validate/test cycles. Each fold trains
    with minibatch SGD, decays the learning rate after a patience window with
    no validation improvement, keeps the best model by validation F1, and
    optionally saves it. Returns a hyperopt-style result dict whose 'loss' is
    the negative median validation F1 across folds.

    NOTE(review): this is Python 2 code (print statements, xrange,
    dict.iteritems); it will not run under Python 3 as written.

    Args:
        params: experiment configuration dict; a default DRLD/char
            configuration is used when None.
    """
    if params is None:
        params = {
            'dataset': 'DRLD',
            'exp_name': 'char_test',
            'test_fold': 0,
            'n_dev_folds': 1,
            'min_doc_thresh': 1,
            'initialize_word_vectors': True,
            'vectors': 'chars_word2vec_25',  # default_word2vec_300, anes_word2vec_300, chars_word2vec_25, eye_1 ...
            'init_scale': 0.2,
            'add_OOV_dim': True,
            'win': 1,                   # size of context window
            'add_DRLD': True,
            'rnn_type': 'basic',        # basic, GRU, or LSTM
            'n_hidden': 50,             # size of hidden units
            'pooling_method': 'max',    # max, mean, or attention1/2
            'bidirectional': True,
            'bi_combine': 'concat',        # concat, max, or mean
            'train_embeddings': True,
            'lr': 0.1,                  # learning rate
            'lr_emb_fac': 1,            # factor to modify learning rate for embeddings
            'decay_delay': 10,           # number of epochs with no improvement before decreasing learning rate
            'decay_factor': 0.5,        # factor by which to multiply learning rate in case of delay
            'n_epochs': 300,
            'add_OOV_noise': True,
            'OOV_noise_prob': 0.01,
            'minibatch_size': 16,
            'classify_minibatch_size': 64,
            'ensemble': False,
            'save_model': True,
            'seed': 42,
            'verbose': 1,
            'reuse': False,
            'orig_T': 0.04,
            'tau': 0.01,
            'clip_gradients': False
        }

    #params = fh.read_json('/Users/dcard/Projects/CMU/ARK/guac/experiments/best_mod.json')
    #params['exp_name'] += '_best'
    #params['n_hidden'] = int(params['n_hidden'])

    # Echo the configuration in sorted key order (Py2: keys() returns a list).
    keys = params.keys()
    keys.sort()
    for key in keys:
        print key, ':', params[key]

    # seed the random number generators
    np.random.seed(params['seed'])
    random.seed(params['seed'])

    vector_type = params['vectors'].split('_')[0]
    params['word2vec_dim'] = int(params['vectors'].split('_')[-1])


    # Optional "reusable holdout" masking of validation scores.
    reuser = None
    if params['reuse']:
        reuser = reusable_holdout.ReuseableHoldout(T=params['orig_T'], tau=params['tau'])

    # Map the dataset alias to its list of constituent corpora.
    if params['dataset'] == 'DRLD':
        datasets = ['Democrat-Likes', 'Democrat-Dislikes', 'Republican-Likes', 'Republican-Dislikes']
    elif params['dataset'] == 'MIP':
        datasets = ['MIP-Personal-1', 'MIP-Personal-2', 'MIP-Political-1', 'MIP-Political-2']
    elif params['dataset'] == 'MOLD':
        datasets = ['McCain-Likes', 'McCain-Dislikes', 'Obama-Likes', 'Obama-Dislikes']
    elif params['dataset'] == 'Primary':
        datasets = ['Obama-Primary', 'Clinton-Primary']
    elif params['dataset'] == 'General':
        datasets = ['Obama-General', 'McCain-General']
    else:
        datasets = [params['dataset']]

    np.random.seed(params['seed'])
    random.seed(params['seed'])

    best_valid_f1s = []
    best_true_valid_f1s = []
    best_test_f1s = []
    best_train_f1s = []

    test_prediction_arrays = []

    output_dir = fh.makedirs(defines.exp_dir, 'rnn', params['exp_name'])
    output_filename = fh.make_filename(output_dir, 'params', 'txt')
    fh.write_to_json(params, output_filename)

    for dev_fold in range(params['n_dev_folds']):
        print "dev fold =", dev_fold

        output_dir = fh.makedirs(defines.exp_dir, 'rnn', params['exp_name'], 'fold' + str(dev_fold))

        if vector_type == 'chars':
            all_data, words2idx, items, all_labels = common.load_char_data(datasets, params['test_fold'], dev_fold)
        else:
            all_data, words2idx, items, all_labels = common.load_data(datasets, params['test_fold'], dev_fold,
                                                                      params['min_doc_thresh'])
        train_xy, valid_xy, test_xy = all_data
        train_lex, train_y = train_xy
        valid_lex, valid_y = valid_xy
        test_lex, test_y = test_xy


        #if params['minibatch_size'] > 1 or params['classify_minibatch_size'] > 1:
        print "padding input with zeros"
        all_data, all_masks = common.prepare_data(train_lex, valid_lex, test_lex)
        train_lex, valid_lex, test_lex = all_data
        train_masks, valid_masks, test_masks = all_masks
        #else:
        #    train_masks = [np.ones(len(x)).astype('int32') for x in train_lex]
        #    valid_masks = [np.ones(len(x)).astype('int32') for x in valid_lex]
        #    test_masks = [np.ones(len(x)).astype('int32') for x in test_lex]

        print "expanding x with context win dows"
        # Rejigger to convert x to contex win in advance
        train_x_win = expand_x_with_context_win(train_lex, params['win'])
        valid_x_win = expand_x_with_context_win(valid_lex, params['win'])
        test_x_win = expand_x_with_context_win(test_lex, params['win'])
        order = range(len(train_lex))
        print "done"

        train_items, dev_items, test_items = items
        vocsize = len(words2idx.keys())
        idx2words = dict((k, v) for v, k in words2idx.iteritems())
        best_test_predictions = None

        n_sentences = len(train_lex)
        print "vocsize = ", vocsize, 'n_train', n_sentences

        codes = all_labels.columns
        n_items, n_codes = all_labels.shape

        # get the words in the sentences for the test and validation sets
        words_valid = [map(lambda x: idx2words[x], w) for w in valid_lex]
        groundtruth_test = test_y[:]
        words_test = [map(lambda x: idx2words[x], w) for w in test_lex]

        #if vector_type == 'eye':
        #    initial_embeddings = np.eye(vocsize)
        #    emb_dim = initial_embeddings.shape[1]
        if params['initialize_word_vectors']:
            initial_embeddings = common.load_embeddings(params, words2idx)
            emb_dim = initial_embeddings.shape[1]
        else:
            initial_embeddings = None
            emb_dim = params['word2vec_dim']
        print "embedding dim =", emb_dim


        temp_output = fh.make_filename(output_dir, 'embedding_labels', 'json')
        fh.write_to_json(idx2words, temp_output)


        extra_input_dims = 0
        if params['add_DRLD']:
            extra_input_dims = 2

        print "Building RNN"
        rnn = RNN(nh=params['n_hidden'],
                  nc=n_codes,
                  ne=vocsize,
                  de=emb_dim,
                  cs=params['win'],
                  extra_input_dims=extra_input_dims,
                  initial_embeddings=initial_embeddings,
                  init_scale=params['init_scale'],
                  rnn_type=params['rnn_type'],
                  train_embeddings=params['train_embeddings'],
                  pooling_method=params['pooling_method'],
                  bidirectional=params['bidirectional'],
                  bi_combine=params['bi_combine'],
                  clip_gradients=params['clip_gradients']
                  )

        temp_filename = fh.make_filename(output_dir, 'initial_embeddings', 'npy')
        rnn.save_embeddings(temp_filename)

        # Binary side-channel features derived from the item identifiers.
        train_likes = [1 if re.search('Likes', i) else 0 for i in train_items]
        dev_likes = [1 if re.search('Likes', i) else 0 for i in dev_items]
        test_likes = [1 if re.search('Likes', i) else 0 for i in test_items]

        train_dem = [1 if re.search('Democrat', i) else 0 for i in train_items]
        dev_dem = [1 if re.search('Democrat', i) else 0 for i in dev_items]
        test_dem = [1 if re.search('Democrat', i) else 0 for i in test_items]

        train_extra = [[train_likes[i], train_dem[i]] for i, t in enumerate(train_items)]
        dev_extra = [[dev_likes[i], dev_dem[i]] for i, t in enumerate(dev_items)]
        test_extra = [[test_likes[i], test_dem[i]] for i, t in enumerate(test_items)]



        ### LOAD
        #rnn.load(output_dir)

        # train with early stopping on validation set
        best_f1 = -np.inf
        params['clr'] = params['lr']
        for e in xrange(params['n_epochs']):
            # shuffle
            #shuffle([train_lex, train_y, train_extra, train_masks], params['seed'])   # shuffle the input data
            shuffle([order, train_lex, train_y, train_extra, train_masks], params['seed'])   # shuffle the input data
            params['ce'] = e                # store the current epoch
            tic = timeit.default_timer()

            ms = params['minibatch_size']
            n_train = len(train_lex)
            nll = 0

            #for i, orig_x in enumerate(train_lex):
            for iteration, i in enumerate(range(0, n_train, ms)):
                #orig_x = train_lex[i]
                #n_words = len(orig_x)
                #if params['add_OOV_noise']:
                #    draws = np.random.rand(n_words)
                #    x = [OOV_index if draws[i] < params['OOV_noise_prob'] else orig_x[i] for i in range(n_words)]
                #else:
                #    x = orig_x
                #y = train_y[i]
                extra = train_extra[i]
                #mask = train_masks[i]

                minibatch_x, minibatch_mask,\
                minibatch_extra, minibatch_y= select_minibatch(train_x_win, train_masks, train_extra, train_y,
                                                               params['win'], i, ms, order,
                                                               params['add_OOV_noise'], params['OOV_noise_prob'])

                #if i == 0:
                #    print '\n'.join([' '.join([idx2words[idx] for idx in minibatch_x[:, k, 0].tolist()]) for
                #           k in range(ms)])

                nll_i, a_sum = rnn.train(minibatch_x, minibatch_mask, minibatch_y, params['win'],
                                params['clr'],
                                params['lr_emb_fac'], extra_input_dims, minibatch_extra)
                nll += nll_i
                #rnn.train(x, mask, y, params['win'], params['clr'], params['lr_emb_fac'],
                #          extra_input_dims, extra)
                print '[learning] epoch %i >> %2.2f%%' % (
                    e, (i + 1) * 100. / float(n_sentences)),
                print 'completed in %.2f (sec), nll = %.2f, a_sum = %.1f <<\r' % (timeit.default_timer() - tic,
                                                                                  nll, np.max(a_sum)),
                sys.stdout.flush()

                # Bail out on numerical divergence; report a sentinel result
                # if no usable model was ever found.
                if np.isnan(nll) or np.isinf(nll):
                    if best_f1 > 0:
                        break
                    else:
                        return {'loss': 1.0,
                                'final_test_f1': 0,
                                'valid_f1s': 0,
                                'true_valid_f1s': 0,
                                'train_f1s': 0,
                                'test_f1s': 0,
                                'status': STATUS_OK
                                }

            # evaluation // back into the real world : idx -> words
            print ""

            #print "true y", train_y[-1]
            #y_pred = rnn.classify(np.array(train_x_win[-1]).reshape((1, len(train_x_win[-1]))),
            #                      train_masks[-1], params['win'], extra_input_dims, train_extra[-1])[0]
            #print "pred y", y_pred

            #if params['pooling_method'] == 'attention1' or params['pooling_method'] == 'attention2':
            #    if extra_input_dims == 0:
            #        r = np.random.randint(0, len(train_lex))
            #        print r, rnn.a_sum_check(np.asarray(contextwin(train_lex[r], params['win'])).astype('int32'))

            predictions_train = predict(n_train, params['classify_minibatch_size'], train_x_win, train_masks,
                                         train_y, params['win'], extra_input_dims, train_extra, rnn, order)
            n_valid = len(valid_lex)
            n_test = len(test_lex)
            predictions_valid = predict(n_valid, params['classify_minibatch_size'], valid_x_win, valid_masks,
                                        valid_y, params['win'], extra_input_dims, dev_extra, rnn)
            predictions_test = predict(n_test, params['classify_minibatch_size'], test_x_win, test_masks,
                                        test_y, params['win'], extra_input_dims, test_extra, rnn)

            """
            predictions_train = [rnn.classify(x, train_masks[i], params['win'],
                                              extra_input_dims, train_extra[i])[0] for i, x in enumerate(train_lex)]
            predictions_valid = [rnn.classify(x, valid_masks[i], params['win'],
                                              extra_input_dims, dev_extra[i])[0] for i, x in enumerate(valid_lex)]
            predictions_test = [rnn.classify(x, test_masks[i], params['win'],
                                             extra_input_dims, test_extra[i])[0] for i, x in enumerate(test_lex)]
            """

            train_f1 = common.calc_mean_f1(predictions_train, train_y)
            test_f1 = common.calc_mean_f1(predictions_test, test_y)
            valid_f1 = common.calc_mean_f1(predictions_valid, valid_y)

            question_f1s = []
            question_pps = []

            print "train_f1 =", train_f1, "valid_f1 =", valid_f1, "test_f1 =", test_f1

            if valid_f1 > best_f1:
                best_rnn = copy.deepcopy(rnn)
                best_f1 = valid_f1
                best_test_predictions = predictions_test

                if params['verbose']:
                    print('NEW BEST: epoch', e,
                          'valid f1', valid_f1,
                          'best test f1', test_f1)

                params['tr_f1'] = train_f1
                params['te_f1'] = test_f1
                params['v_f1'] = valid_f1
                params['be'] = e            # store the current epoch as a new best

            # learning rate decay if no improvement in a given number of epochs
            if abs(params['be']-params['ce']) >= params['decay_delay']:
                params['clr'] *= params['decay_factor']
                params['be'] = params['ce']
                print "Reverting to current best; new learning rate = ", params['clr']
                # also reset to the previous best
                rnn = best_rnn

            if params['clr'] < 1e-5:
                break

            if best_f1 == 1.0:
                break

            if best_f1 == 0 and e > 7:
                break

        if params['save_model']:
            predictions_test = predict(len(test_y), params['classify_minibatch_size'], test_x_win, test_masks,
                                       test_y, params['win'], extra_input_dims, test_extra, best_rnn)
            best_rnn.save(output_dir)
            common.write_predictions(datasets, params['test_fold'], dev_fold, predictions_test, test_items, output_dir)

        print('BEST RESULT: epoch', params['be'],
              'train F1 ', params['tr_f1'],
              'valid F1', params['v_f1'],
              'best test F1', params['te_f1'],
              'with the model', output_dir)


        best_true_valid_f1s.append(params['v_f1'])
        best_test_f1s.append(params['te_f1'])
        best_train_f1s.append(params['tr_f1'])
        if reuser is not None:
            best_valid_f1 = reuser.mask_value(params['v_f1'], params['tr_f1'])
        else:
            best_valid_f1 = params['v_f1']
        best_valid_f1s.append(best_valid_f1)


        test_prediction_arrays.append(np.array(best_test_predictions, dtype=int))

    # NOTE(review): ensemble is forcibly disabled on the next line, so the
    # `if params['ensemble']:` branch below is dead code -- confirm intent.
    params['ensemble'] = False
    if params['ensemble']:
        test_predictions_stack = np.dstack(test_prediction_arrays)
        final_predictions = stats.mode(test_predictions_stack, axis=2)[0][:, :, 0]
        predicted_df = pd.DataFrame(final_predictions, index=test_items, columns=codes)
        true_df = pd.DataFrame(np.array(test_y), index=test_items, columns=codes)
        final_test_f1, final_test_pp = evaluation.calc_macro_mean_f1_pp(true_df, predicted_df)
    else:
        final_test_f1 = np.median(best_test_f1s)

    return {'loss': -np.median(best_valid_f1s),
            'final_test_f1': final_test_f1,
            'valid_f1s': best_valid_f1s,
            'train_f1s': best_train_f1s,
            'true_valid_f1s': best_true_valid_f1s,
            'test_f1s': best_test_f1s,
            'status': STATUS_OK
            }
예제 #43
0
def test_behavior_defects_module(filename, plot, test_type):
    # initialize modules
    train_values    = 1
    train_trees     = 10
    filename_train  = "train_data/behavior_defects_data.output"
    init_server.init_behavior_defects_module(filename_train, train_values, train_trees)
    
    if test_type == "full": # generate new dataset
        print bcolors.HEADER + "initialize dependent modules" + bcolors.ENDC
        init_server.init_speed_module   ("train_data/speed_acc_data.output",  10, 15)
        init_server.init_turns_module   ("train_data/turns_com_data.output",   5, 10)
        init_server.init_defects_module ("train_data/defects_acc_data.output", 5, 15)
        print bcolors.OKGREEN + "Done! " + bcolors.ENDC
 
        # load dependent test data and classify actions
        # structure: time,accx,accy,accz,compass,lat,lon,speed
        test_values = 5
        
        print bcolors.HEADER + "Start getting speed data" + bcolors.ENDC
        test_speed_data = cmn.aver_std_array(cmn.load_data(filename, (2,)), test_values)
        test_speed_data = test_speed_data.reshape(len(test_speed_data)/2, 2)
        predicted_speed = sp.predicted(test_speed_data)
        predicted_speed = predicted_speed.reshape(len(predicted_speed), 1)
        print bcolors.OKGREEN + "Done! " + bcolors.ENDC#, predicted_speed

        print bcolors.HEADER + "Start getting turns data" + bcolors.ENDC
        test_turns_data = cmn.get_diff_array(cmn.load_data(filename, (4,)))
        test_turns_data = cmn.aver_std_array(test_turns_data, test_values)
        test_turns_data = test_turns_data.reshape(len(test_turns_data)/2, 2)
        predicted_turns = tr.predicted(test_turns_data)
        predicted_turns = predicted_turns.reshape(len(predicted_turns), 1)
        print bcolors.OKGREEN + "Done! " + bcolors.ENDC#, predicted_turns
    
        print bcolors.HEADER + "Start getting defects data" + bcolors.ENDC
        test_defects_data = cmn.aver_std_array(cmn.load_data(filename, (3,)), test_values)
        test_defects_data = test_defects_data.reshape(len(test_defects_data)/2, 2)
        predicted_defects = df.predicted(test_defects_data)
        predicted_defects = predicted_defects.reshape(len(predicted_defects), 1)
        print bcolors.OKGREEN + "Done! " + bcolors.ENDC#, predicted_defects

        print bcolors.HEADER + "Start generating test data" + bcolors.ENDC
        test_times = cmn.label_array(cmn.load_data(filename, (0,)), test_values)
        test_data  = np.hstack((predicted_speed, predicted_turns, predicted_defects))
        np.savetxt('generated_behavior_defects_data.output', test_data, delimiter=',', fmt='%i')
        print bcolors.OKGREEN + "Done! " + bcolors.ENDC#, test_data

    elif test_type == "express":  
        # use default dataset
        # structure: time,speed,turn,defect,lat,lon
        print bcolors.HEADER + "Start getting test data" + bcolors.ENDC
        test_values = 1
        test_speed_data   = cmn.label_array(cmn.load_data(filename, (1,)), values)
        test_speed_data   = test_speed_data.reshape(len(test_speed_data), 1)
        test_turns_data   = cmn.label_array(cmn.load_data(filename, (2,)), values)
        test_turns_data   = test_turns_data.reshape(len(test_turns_data), 1)
        test_defects_data = cmn.label_array(cmn.load_data(filename, (3,)), values)
        test_defects_data = test_defects_data.reshape(len(test_defects_data), 1)
        test_data = np.hstack((test_speed_data, test_turns_data, test_defects_data))

        test_times = cmn.label_array(cmn.load_data(filename, (6,)), test_values)
        print bcolors.OKGREEN + "Done! test_data:\n" + bcolors.ENDC, test_data
    else :
        print bcolor.FAIL + "behavior_defects_module: invalid test type, exit" + bcolors.ENDC
        return

    # plot result is not used currently
    #if plot == "yes" :
    #    train_speed_data   = cmn.label_array(cmn.load_data(filename_train, (1,)), train_values)
    #    train_speed_data   = train_speed_data.reshape(len(train_speed_data), 1)
    #    train_turns_data   = cmn.label_array(cmn.load_data(filename_train, (2,)), train_values)
    #    train_turns_data   = train_turns_data.reshape(len(train_turns_data), 1)
    #    train_defects_data = cmn.label_array(cmn.load_data(filename_train, (3,)), train_values)
    #    train_defects_data = train_defects_data.reshape(len(train_defects_data), 1)
    #    train_data = np.hstack((train_speed_data, train_turns_data, train_defects_data))
    #    xx, yy = cmn.get_grid(train_data[:, [0, 1]])
    #    train_predicted = bd.predicted(np.c_[xx.ravel(), yy.ravel()], 1).reshape(xx.shape)
    #    #print "Train data\n", train_data
    #    #print "Train predicted\n", train_predicted
    #    test_predicted  = bd.predicted(test_data)
    #    cmn.plot_2D_data(test_data, test_predicted, train_data, train_predicted, [0, 5.0], [0, 5.0]);

    # skip waiting (speed ~ 0)

    # check is arrays is empty

    # get new types for defects
    raw_input(bcolors.OKBLUE + "Ready to start! Press Enter to continue..." + bcolors.ENDC)
    bd.find_actions(test_data, test_times)
    
    # writing defects to DB is not used in test module
    #bd.add_defects()

    return
예제 #44
0
def main(params=None):
    """Train and evaluate an RNN text classifier on the DRLD corpora.

    Per-sentence (minibatch_size is effectively 1 here) SGD training with
    early stopping on validation F1, learning-rate decay after a patience
    window, optional model saving, and a per-epoch results log. Returns a
    hyperopt-style result dict (negative median valid F1 as 'loss').

    NOTE(review): this is Python 2 code (print statements, xrange,
    dict.iteritems); it will not run under Python 3 as written.

    Args:
        params: experiment configuration dict; a default configuration is
            used when None -- but see the override note below.
    """
    if params is None:
        params = {
            'exp_name': 'minibatch_test',
            'test_fold': 0,
            'n_dev_folds': 1,
            'min_doc_thresh': 1,
            'initialize_word_vectors': True,
            'vectors': 'anes_word2vec',  # default_word2vec, anes_word2vec ...
            'word2vec_dim': 300,
            'init_scale': 0.2,
            'add_OOV': True,
            'win': 3,                   # size of context window
            'add_DRLD': False,
            'rnn_type': 'basic',        # basic, GRU, or LSTM
            'n_hidden': 3,             # size of hidden units
            'pooling_method': 'max',    # max, mean, or attention1/2
            'bidirectional': False,
            'bi_combine': 'mean',        # concat, max, or mean
            'train_embeddings': True,
            'lr': 0.1,                  # learning rate
            'lr_emb_fac': 0.2,            # factor to modify learning rate for embeddings
            'decay_delay': 5,           # number of epochs with no improvement before decreasing learning rate
            'decay_factor': 0.5,        # factor by which to multiply learning rate in case of delay
            'n_epochs': 10,
            'add_OOV_noise': False,
            'OOV_noise_prob': 0.01,
            'minibatch_size': 1,
            'ensemble': False,
            'save_model': True,
            'seed': 42,
            'verbose': 1,
            'reuse': False,
            'orig_T': 0.04,
            'tau': 0.01
        }

    # load params from a previous experiment
    # NOTE(review): this UNCONDITIONALLY overwrites both the defaults above and
    # any caller-supplied params, from a hard-coded user-specific path --
    # confirm whether this override is still wanted.
    params = fh.read_json('/Users/dcard/Projects/CMU/ARK/guac/experiments/best_mod.json')
    params['exp_name'] += '_minibatch_16'
    params['n_hidden'] = int(params['n_hidden'])
    params['orig_T'] = 0.02
    params['tau'] = 0.005


    # Optional "reusable holdout" masking of validation scores.
    reuser = None
    if params['reuse']:
        reuser = reusable_holdout.ReuseableHoldout(T=params['orig_T'], tau=params['tau'])

    # Echo the configuration in sorted key order (Py2: keys() returns a list).
    keys = params.keys()
    keys.sort()
    for key in keys:
        print key, ':', params[key]

    # seed the random number generators
    np.random.seed(params['seed'])
    random.seed(params['seed'])

    datasets = ['Democrat-Likes', 'Democrat-Dislikes', 'Republican-Likes', 'Republican-Dislikes']

    np.random.seed(params['seed'])
    random.seed(params['seed'])

    best_valid_f1s = []
    best_test_f1s = []

    test_prediction_arrays = []

    output_dir = fh.makedirs(defines.exp_dir, 'rnn', params['exp_name'])
    output_filename = fh.make_filename(output_dir, 'params', 'json')
    fh.write_to_json(params, output_filename)

    for dev_fold in range(params['n_dev_folds']):
        print "dev fold =", dev_fold

        output_dir = fh.makedirs(defines.exp_dir, 'rnn', params['exp_name'], 'fold' + str(dev_fold))
        results = []

        all_data, words2idx, items, all_labels = common.load_data(datasets, params['test_fold'], dev_fold,
                                                                  params['min_doc_thresh'])
        train_xy, valid_xy, test_xy = all_data
        train_lex, train_y = train_xy
        valid_lex, valid_y = valid_xy
        test_lex, test_y = test_xy
        train_items, dev_items, test_items = items
        vocsize = len(words2idx.keys())
        idx2words = dict((k, v) for v, k in words2idx.iteritems())
        best_test_predictions = None

        n_sentences = len(train_lex)
        print "vocsize = ", vocsize, 'n_train', n_sentences

        codes = all_labels.columns
        n_items, n_codes = all_labels.shape

        # get the words in the sentences for the test and validation sets
        words_valid = [map(lambda x: idx2words[x], w) for w in valid_lex]
        groundtruth_test = test_y[:]
        words_test = [map(lambda x: idx2words[x], w) for w in test_lex]

        initial_embeddings = common.load_embeddings(params, words2idx)
        OOV_index = words2idx['__OOV__']
        emb_dim = initial_embeddings.shape[1]
        print 'emb_dim =', emb_dim

        extra_input_dims = 0
        if params['add_DRLD']:
            extra_input_dims = 2

        print "Building RNN"
        rnn = RNN(nh=params['n_hidden'],
                  nc=n_codes,
                  ne=vocsize,
                  de=emb_dim,
                  cs=params['win'],
                  extra_input_dims=extra_input_dims,
                  initial_embeddings=initial_embeddings,
                  init_scale=params['init_scale'],
                  rnn_type=params['rnn_type'],
                  train_embeddings=params['train_embeddings'],
                  pooling_method=params['pooling_method'],
                  bidirectional=params['bidirectional'],
                  bi_combine=params['bi_combine']
                  )

        # Binary side-channel features derived from the item identifiers.
        train_likes = [1 if re.search('Likes', i) else 0 for i in train_items]
        dev_likes = [1 if re.search('Likes', i) else 0 for i in dev_items]
        test_likes = [1 if re.search('Likes', i) else 0 for i in test_items]

        train_dem = [1 if re.search('Democrat', i) else 0 for i in train_items]
        dev_dem = [1 if re.search('Democrat', i) else 0 for i in dev_items]
        test_dem = [1 if re.search('Democrat', i) else 0 for i in test_items]

        train_extra = [[train_likes[i], train_dem[i]] for i, t in enumerate(train_items)]
        dev_extra = [[dev_likes[i], dev_dem[i]] for i, t in enumerate(dev_items)]
        test_extra = [[test_likes[i], test_dem[i]] for i, t in enumerate(test_items)]

        # train with early stopping on validation set


        best_f1 = -np.inf
        params['clr'] = params['lr']
        for e in xrange(params['n_epochs']):
            # shuffle
            shuffle([train_lex, train_y, train_extra], params['seed'])   # shuffle the input data
            params['ce'] = e                # store the current epoch
            tic = timeit.default_timer()

            #for i, (x, y) in enumerate(zip(train_lex, train_y)):
            for i, orig_x in enumerate(train_lex):
                n_words = len(orig_x)
                if params['add_OOV_noise']:
                    # Randomly replace tokens with the OOV index for regularization.
                    draws = np.random.rand(n_words)
                    x = [OOV_index if draws[idx] < params['OOV_noise_prob'] else orig_x[idx] for idx in range(n_words)]
                else:
                    x = orig_x
                y = train_y[i]
                extra = train_extra[i]

                if i == 0:
                    print ' '.join([idx2words[w] for w in train_lex[i]])

                if i == 0:
                    print x
                    print y

                nll = rnn.train(x, y, params['win'], params['clr'], params['lr_emb_fac'],
                          extra_input_dims, extra)
                # Print the loss every 100 sentences.
                if float(i/100.0) == float(i//100):
                    print nll
                print '[learning] epoch %i >> %2.2f%%' % (
                    e, (i + 1) * 100. / float(n_sentences)),
                print 'completed in %.2f (sec) <<\r' % (timeit.default_timer() - tic),
                sys.stdout.flush()
                #if i == 0:
                #    print ' '.join([idx2words[idx] for idx in orig_x])
                #    print rnn.classify(orig_x, params['win'], extra_input_dims, extra)

                # Bail out immediately on numerical divergence.
                if np.isnan(nll) or np.isinf(nll):
                    return {'loss': nll,
                            'final_test_f1': 0,
                            'valid_f1s': [0],
                            'test_f1s': [0],
                            'status': STATUS_OK
                            }

            # evaluation // back into the real world : idx -> words
            print ""

            #print rnn.classify((np.asarray(contextwin(train_lex[0], params['win'])).astype('int32')), train_likes[0], params['win'])
            #print rnn.classify(train_lex[0], params['win'], extra_input_dims, train_extra[0])
            #print rnn.get_element_weights(np.asarray(contextwin(train_lex[0], params['win'])).astype('int32'))
            #if params['pooling_method'] == 'attention1' or params['pooling_method'] == 'attention2':
            #    if extra_input_dims == 0:
            #        r = np.random.randint(0, len(train_lex))
            #        print r, rnn.a_sum_check(np.asarray(contextwin(train_lex[r], params['win'])).astype('int32'))

            """
            predictions_train = [np.max(rnn.classify(np.asarray(contextwin(x, params['win'])).astype('int32')), axis=0)
                                 for x in train_lex]
            predictions_test = [np.max(rnn.classify(np.asarray(contextwin(x, params['win'])).astype('int32')), axis=0)
                                for x in test_lex]
            predictions_valid = [np.max(rnn.classify(np.asarray(contextwin(x, params['win'])).astype('int32')), axis=0)
                                 for x in valid_lex]
            """

            #predictions_train = [rnn.classify(np.asarray(contextwin(x, params['win'])).astype('int32'), likes) for x in train_lex]
            #predictions_test = [rnn.classify(np.asarray(contextwin(x, params['win'])).astype('int32'), likes) for x in test_lex]
            #predictions_valid = [rnn.classify(np.asarray(contextwin(x, params['win'])).astype('int32'), likes) for x in valid_lex]

            predictions_train = [rnn.classify(x, params['win'],
                                              extra_input_dims, train_extra[i]) for i, x in enumerate(train_lex)]
            predictions_test = [rnn.classify(x, params['win'],
                                             extra_input_dims, test_extra[i]) for i, x in enumerate(test_lex)]
            predictions_valid = [rnn.classify(x, params['win'],
                                              extra_input_dims, dev_extra[i]) for i, x in enumerate(valid_lex)]

            train_f1 = common.calc_mean_f1(predictions_train, train_y)
            test_f1 = common.calc_mean_f1(predictions_test, test_y)
            valid_f1 = common.calc_mean_f1(predictions_valid, valid_y)

            if reuser is not None:
                valid_f1 = reuser.mask_value(valid_f1, train_f1)

            question_f1s = []
            question_pps = []

            print "train_f1 =", train_f1, "valid_f1 =", valid_f1, "test_f1 =", test_f1
            results.append((train_f1, valid_f1, test_f1))

            if valid_f1 > best_f1:
                best_rnn = copy.deepcopy(rnn)
                best_f1 = valid_f1
                best_test_predictions = predictions_test

                if params['verbose']:
                    print('NEW BEST: epoch', e,
                          'valid f1', valid_f1,
                          'best test f1', test_f1)

                params['tr_f1'] = train_f1
                params['te_f1'] = test_f1
                params['v_f1'] = valid_f1
                params['be'] = e            # store the current epoch as a new best

            # learning rate decay if no improvement in a given number of epochs
            if abs(params['be']-params['ce']) >= params['decay_delay']:
                params['clr'] *= params['decay_factor']
                params['be'] = params['ce']
                print "Reverting to current best; new learning rate = ", params['clr']
                # also reset to the previous best
                rnn = best_rnn

            if params['clr'] < 1e-5:
                break

            if best_f1 == 1.0:
                break

            if best_f1 == 0 and e > 10:
                break

        if params['save_model']:
            predictions_valid = [rnn.classify(x, params['win'],
                                              extra_input_dims, dev_extra[i]) for i, x in enumerate(valid_lex)]

            #predictions_valid = [best_rnn.classify(np.asarray(contextwin(x, params['win'])).astype('int32')) for x in valid_lex]
            best_rnn.save(output_dir)
            common.write_predictions(datasets, params['test_fold'], dev_fold, predictions_valid, dev_items, output_dir)

        print('BEST RESULT: epoch', params['be'],
              'train F1 ', params['tr_f1'],
              'valid F1', params['v_f1'],
              'best test F1', params['te_f1'],
              'with the model', output_dir)

        best_valid_f1s.append(params['v_f1'])
        best_test_f1s.append(params['te_f1'])

        test_prediction_arrays.append(np.array(best_test_predictions, dtype=int))

        # Append a per-epoch results log for this fold.
        output_filename = fh.make_filename(output_dir, 'results', 'txt')
        with codecs.open(output_filename, 'w') as output_file:
            for e, result in enumerate(results):
                output_file.write('epoch=' + str(e) + '; train_f1=' + str(result[0]) +
                                  '; valid_f1=' + str(result[1]) + '; test_f1=' + str(result[2]) + '\n')

    if params['ensemble']:
        test_predictions_stack = np.dstack(test_prediction_arrays)
        final_predictions = stats.mode(test_predictions_stack, axis=2)[0][:, :, 0]
        predicted_df = pd.DataFrame(final_predictions, index=test_items, columns=codes)
        true_df = pd.DataFrame(np.array(test_y), index=test_items, columns=codes)
        final_test_f1, final_test_pp = evaluation.calc_macro_mean_f1_pp(true_df, predicted_df)
    else:
        final_test_f1 = np.median(best_test_f1s)

    return {'loss': -np.median(best_valid_f1s),
            'final_test_f1': final_test_f1,
            'valid_f1s': best_valid_f1s,
            'test_f1s': best_test_f1s,
            'status': STATUS_OK
            }
예제 #45
0
        Theta1 = Theta[1] - alpha * delta
        # 値は同時に更新します
        Theta = [Theta0, Theta1]
        # 表示
        if i % 10 == 0 or i == iteration-1:
            cost = compute_cost(x_vals, y_vals, Theta, hypothesis_func)
            print("itr=%d, cost=%f, Theta0=%f, Theta1=%f" % (i, cost, Theta[0], Theta[1]))
    return Theta


if __name__ == "__main__":

    # 01. Load the data
    #---------------------------------------------
    # Load the dataset used in this example
    data, x_vals, y_vals = cmn.load_data()
    # Take a look at the first ~10 rows
    print('-----------------\n#今回利用するデータ(上10件)')
    pprint(data[:10])
    # Plot the data on a graph
    cmn.show(data)


    # 02. (Before optimization) compute predictions and cost
    #---------------------------------------------
    # Initialize theta to 10 (any other value would also work);
    # this means "Theta0 = 10, Theta1 = 10".
    Theta = [10, 10]
    # Use this theta to make predictions for the top 10 rows
    hypo = hypothesis(x_vals, Theta)
    # Display the top 3 prediction results (before optimization)
예제 #46
0
def main(params=None):
    """Train a multi-label RNN classifier with early stopping (Python 2 code).

    Runs one training loop per dev fold, tracks the best model by validation
    F1, optionally ensembles per-fold test predictions by majority vote, and
    returns a hyperopt-style result dict whose 'loss' is the negated median
    of the best validation F1 scores.

    NOTE(review): the default ``params`` dict below is effectively dead code —
    it is unconditionally overwritten by the JSON file loaded right after it
    (see the NOTE at that line).
    """

    if params is None:
        params = {
            "exp_name": "minibatch_test",
            "test_fold": 0,
            "n_dev_folds": 1,
            "min_doc_thresh": 1,
            "initialize_word_vectors": True,
            "vectors": "anes_word2vec",  # default_word2vec, anes_word2vec ...
            "word2vec_dim": 300,
            "init_scale": 0.2,
            "add_OOV": True,
            "win": 3,  # size of context window
            "add_DRLD": False,
            "rnn_type": "basic",  # basic, GRU, or LSTM
            "n_hidden": 3,  # size of hidden units
            "pooling_method": "max",  # max, mean, or attention1/2
            "bidirectional": False,
            "bi_combine": "mean",  # concat, max, or mean
            "train_embeddings": True,
            "lr": 0.1,  # learning rate
            "lr_emb_fac": 0.2,  # factor to modify learning rate for embeddings
            "decay_delay": 5,  # number of epochs with no improvement before decreasing learning rate
            "decay_factor": 0.5,  # factor by which to multiply learning rate in case of delay
            "n_epochs": 10,
            "add_OOV_noise": False,
            "OOV_noise_prob": 0.01,
            "minibatch_size": 1,
            "ensemble": False,
            "save_model": True,
            "seed": 42,
            "verbose": 1,
            "reuse": False,
            "orig_T": 0.04,
            "tau": 0.01,
        }

    # load params from a previous experiment
    # NOTE(review): this overwrites the params above unconditionally, even when
    # the caller passed an explicit `params` argument — confirm this is intended.
    params = fh.read_json("/Users/dcard/Projects/CMU/ARK/guac/experiments/best_mod.json")
    params["exp_name"] += "_minibatch_16"
    params["n_hidden"] = int(params["n_hidden"])
    params["orig_T"] = 0.02
    params["tau"] = 0.005

    reuser = None
    if params["reuse"]:
        # reusable holdout: masks validation scores to limit adaptive overfitting
        reuser = reusable_holdout.ReuseableHoldout(T=params["orig_T"], tau=params["tau"])

    # print the configuration in a stable (sorted) key order
    keys = params.keys()
    keys.sort()
    for key in keys:
        print key, ":", params[key]

    # seed the random number generators
    np.random.seed(params["seed"])
    random.seed(params["seed"])

    datasets = ["Democrat-Likes", "Democrat-Dislikes", "Republican-Likes", "Republican-Dislikes"]

    # (re-seeded; harmless repetition of the lines above)
    np.random.seed(params["seed"])
    random.seed(params["seed"])

    best_valid_f1s = []
    best_test_f1s = []

    test_prediction_arrays = []

    # persist the effective parameters next to the experiment outputs
    output_dir = fh.makedirs(defines.exp_dir, "rnn", params["exp_name"])
    output_filename = fh.make_filename(output_dir, "params", "json")
    fh.write_to_json(params, output_filename)

    for dev_fold in range(params["n_dev_folds"]):
        print "dev fold =", dev_fold

        output_dir = fh.makedirs(defines.exp_dir, "rnn", params["exp_name"], "fold" + str(dev_fold))
        results = []  # one (train_f1, valid_f1, test_f1) tuple per epoch

        all_data, words2idx, items, all_labels = common.load_data(
            datasets, params["test_fold"], dev_fold, params["min_doc_thresh"]
        )
        train_xy, valid_xy, test_xy = all_data
        train_lex, train_y = train_xy
        valid_lex, valid_y = valid_xy
        test_lex, test_y = test_xy
        train_items, dev_items, test_items = items
        vocsize = len(words2idx.keys())
        # invert the vocabulary mapping: index -> word
        idx2words = dict((k, v) for v, k in words2idx.iteritems())
        best_test_predictions = None

        n_sentences = len(train_lex)
        print "vocsize = ", vocsize, "n_train", n_sentences

        codes = all_labels.columns
        n_items, n_codes = all_labels.shape

        # get the words in the sentences for the test and validation sets
        words_valid = [map(lambda x: idx2words[x], w) for w in valid_lex]
        groundtruth_test = test_y[:]
        words_test = [map(lambda x: idx2words[x], w) for w in test_lex]

        initial_embeddings = common.load_embeddings(params, words2idx)
        OOV_index = words2idx["__OOV__"]
        emb_dim = initial_embeddings.shape[1]
        print "emb_dim =", emb_dim

        extra_input_dims = 0
        if params["add_DRLD"]:
            # two extra binary input features: Likes/Dislikes and Democrat/Republican
            extra_input_dims = 2

        print "Building RNN"
        rnn = RNN(
            nh=params["n_hidden"],
            nc=n_codes,
            ne=vocsize,
            de=emb_dim,
            cs=params["win"],
            extra_input_dims=extra_input_dims,
            initial_embeddings=initial_embeddings,
            init_scale=params["init_scale"],
            rnn_type=params["rnn_type"],
            train_embeddings=params["train_embeddings"],
            pooling_method=params["pooling_method"],
            bidirectional=params["bidirectional"],
            bi_combine=params["bi_combine"],
        )

        # binary indicator features derived from the item identifiers
        train_likes = [1 if re.search("Likes", i) else 0 for i in train_items]
        dev_likes = [1 if re.search("Likes", i) else 0 for i in dev_items]
        test_likes = [1 if re.search("Likes", i) else 0 for i in test_items]

        train_dem = [1 if re.search("Democrat", i) else 0 for i in train_items]
        dev_dem = [1 if re.search("Democrat", i) else 0 for i in dev_items]
        test_dem = [1 if re.search("Democrat", i) else 0 for i in test_items]

        train_extra = [[train_likes[i], train_dem[i]] for i, t in enumerate(train_items)]
        dev_extra = [[dev_likes[i], dev_dem[i]] for i, t in enumerate(dev_items)]
        test_extra = [[test_likes[i], test_dem[i]] for i, t in enumerate(test_items)]

        # train with early stopping on validation set

        best_f1 = -np.inf
        params["clr"] = params["lr"]  # "clr" = current (decaying) learning rate
        for e in xrange(params["n_epochs"]):
            # shuffle
            shuffle([train_lex, train_y, train_extra], params["seed"])  # shuffle the input data
            params["ce"] = e  # store the current epoch
            tic = timeit.default_timer()

            # for i, (x, y) in enumerate(zip(train_lex, train_y)):
            for i, orig_x in enumerate(train_lex):
                n_words = len(orig_x)
                if params["add_OOV_noise"]:
                    # randomly replace words with the OOV token as regularization
                    draws = np.random.rand(n_words)
                    x = [OOV_index if draws[idx] < params["OOV_noise_prob"] else orig_x[idx] for idx in range(n_words)]
                else:
                    x = orig_x
                y = train_y[i]
                extra = train_extra[i]

                if i == 0:
                    print " ".join([idx2words[w] for w in train_lex[i]])

                if i == 0:
                    print x
                    print y

                nll = rnn.train(x, y, params["win"], params["clr"], params["lr_emb_fac"], extra_input_dims, extra)
                # print the loss every 100th sentence
                if float(i / 100.0) == float(i // 100):
                    print nll
                print "[learning] epoch %i >> %2.2f%%" % (e, (i + 1) * 100.0 / float(n_sentences)),
                print "completed in %.2f (sec) <<\r" % (timeit.default_timer() - tic),
                sys.stdout.flush()
                # if i == 0:
                #    print ' '.join([idx2words[idx] for idx in orig_x])
                #    print rnn.classify(orig_x, params['win'], extra_input_dims, extra)

                # abort the whole run immediately if training diverged
                if np.isnan(nll) or np.isinf(nll):
                    return {"loss": nll, "final_test_f1": 0, "valid_f1s": [0], "test_f1s": [0], "status": STATUS_OK}

            # evaluation // back into the real world : idx -> words
            print ""

            # print rnn.classify((np.asarray(contextwin(train_lex[0], params['win'])).astype('int32')), train_likes[0], params['win'])
            # print rnn.classify(train_lex[0], params['win'], extra_input_dims, train_extra[0])
            # print rnn.get_element_weights(np.asarray(contextwin(train_lex[0], params['win'])).astype('int32'))
            # if params['pooling_method'] == 'attention1' or params['pooling_method'] == 'attention2':
            #    if extra_input_dims == 0:
            #        r = np.random.randint(0, len(train_lex))
            #        print r, rnn.a_sum_check(np.asarray(contextwin(train_lex[r], params['win'])).astype('int32'))

            """
            predictions_train = [np.max(rnn.classify(np.asarray(contextwin(x, params['win'])).astype('int32')), axis=0)
                                 for x in train_lex]
            predictions_test = [np.max(rnn.classify(np.asarray(contextwin(x, params['win'])).astype('int32')), axis=0)
                                for x in test_lex]
            predictions_valid = [np.max(rnn.classify(np.asarray(contextwin(x, params['win'])).astype('int32')), axis=0)
                                 for x in valid_lex]
            """

            # predictions_train = [rnn.classify(np.asarray(contextwin(x, params['win'])).astype('int32'), likes) for x in train_lex]
            # predictions_test = [rnn.classify(np.asarray(contextwin(x, params['win'])).astype('int32'), likes) for x in test_lex]
            # predictions_valid = [rnn.classify(np.asarray(contextwin(x, params['win'])).astype('int32'), likes) for x in valid_lex]

            # classify every split with the current model
            predictions_train = [
                rnn.classify(x, params["win"], extra_input_dims, train_extra[i]) for i, x in enumerate(train_lex)
            ]
            predictions_test = [
                rnn.classify(x, params["win"], extra_input_dims, test_extra[i]) for i, x in enumerate(test_lex)
            ]
            predictions_valid = [
                rnn.classify(x, params["win"], extra_input_dims, dev_extra[i]) for i, x in enumerate(valid_lex)
            ]

            train_f1 = common.calc_mean_f1(predictions_train, train_y)
            test_f1 = common.calc_mean_f1(predictions_test, test_y)
            valid_f1 = common.calc_mean_f1(predictions_valid, valid_y)

            if reuser is not None:
                # pass the validation score through the reusable holdout
                valid_f1 = reuser.mask_value(valid_f1, train_f1)

            question_f1s = []
            question_pps = []

            print "train_f1 =", train_f1, "valid_f1 =", valid_f1, "test_f1 =", test_f1
            results.append((train_f1, valid_f1, test_f1))

            if valid_f1 > best_f1:
                # snapshot the best model so far (by validation F1)
                best_rnn = copy.deepcopy(rnn)
                best_f1 = valid_f1
                best_test_predictions = predictions_test

                if params["verbose"]:
                    print ("NEW BEST: epoch", e, "valid f1", valid_f1, "best test f1", test_f1)

                params["tr_f1"] = train_f1
                params["te_f1"] = test_f1
                params["v_f1"] = valid_f1
                params["be"] = e  # store the current epoch as a new best

            # learning rate decay if no improvement in a given number of epochs
            if abs(params["be"] - params["ce"]) >= params["decay_delay"]:
                params["clr"] *= params["decay_factor"]
                params["be"] = params["ce"]
                print "Reverting to current best; new learning rate = ", params["clr"]
                # also reset to the previous best
                rnn = best_rnn

            # stop once the learning rate has decayed to (near) zero
            if params["clr"] < 1e-5:
                break

            # stop early on a perfect validation score
            if best_f1 == 1.0:
                break

            # give up if nothing has been learned after 10 epochs
            if best_f1 == 0 and e > 10:
                break

        if params["save_model"]:
            predictions_valid = [
                rnn.classify(x, params["win"], extra_input_dims, dev_extra[i]) for i, x in enumerate(valid_lex)
            ]

            # predictions_valid = [best_rnn.classify(np.asarray(contextwin(x, params['win'])).astype('int32')) for x in valid_lex]
            best_rnn.save(output_dir)
            common.write_predictions(datasets, params["test_fold"], dev_fold, predictions_valid, dev_items, output_dir)

        print (
            "BEST RESULT: epoch",
            params["be"],
            "train F1 ",
            params["tr_f1"],
            "valid F1",
            params["v_f1"],
            "best test F1",
            params["te_f1"],
            "with the model",
            output_dir,
        )

        best_valid_f1s.append(params["v_f1"])
        best_test_f1s.append(params["te_f1"])

        test_prediction_arrays.append(np.array(best_test_predictions, dtype=int))

        # dump the per-epoch F1 trajectory for this fold
        output_filename = fh.make_filename(output_dir, "results", "txt")
        with codecs.open(output_filename, "w") as output_file:
            for e, result in enumerate(results):
                output_file.write(
                    "epoch="
                    + str(e)
                    + "; train_f1="
                    + str(result[0])
                    + "; valid_f1="
                    + str(result[1])
                    + "; test_f1="
                    + str(result[2])
                    + "\n"
                )

    if params["ensemble"]:
        # majority vote across the per-fold best test predictions
        test_predictions_stack = np.dstack(test_prediction_arrays)
        final_predictions = stats.mode(test_predictions_stack, axis=2)[0][:, :, 0]
        predicted_df = pd.DataFrame(final_predictions, index=test_items, columns=codes)
        true_df = pd.DataFrame(np.array(test_y), index=test_items, columns=codes)
        final_test_f1, final_test_pp = evaluation.calc_macro_mean_f1_pp(true_df, predicted_df)
    else:
        final_test_f1 = np.median(best_test_f1s)

    # hyperopt-style result: minimize the negated median validation F1
    return {
        "loss": -np.median(best_valid_f1s),
        "final_test_f1": final_test_f1,
        "valid_f1s": best_valid_f1s,
        "test_f1s": best_test_f1s,
        "status": STATUS_OK,
    }
예제 #47
0
def create_setup():
    """Copy files from template and update them with user input."""
    global app_name, app_version, app_license, app_author, app_email, \
        app_url, app_keywords, DEFAULT_AUTHOR, DEFAULT_EMAIL, \
        DEFAULT_LICENSE, DEFAULT_URL, DEFAULT_VERSION

    data_lst = common.load_data()
    if data_lst:
        (DEFAULT_AUTHOR, DEFAULT_EMAIL, DEFAULT_LICENSE, DEFAULT_URL,
         DEFAULT_VERSION) = data_lst

    while not app_name:
        app_name = input(lcl.Q_APP_NAME).decode(lcl.INPUT_ENC)

    app_version = input(lcl.Q_APP_VERSION + '[' + DEFAULT_VERSION +
                        '] ').decode(lcl.INPUT_ENC)
    if not app_version:
        app_version = DEFAULT_VERSION

    app_license = input(lcl.Q_APP_LICENSE + '[' + DEFAULT_LICENSE +
                        '] ').decode(lcl.INPUT_ENC)
    if not app_license:
        app_license = DEFAULT_LICENSE

    app_author = input(lcl.Q_APP_AUTHOR + '[' + DEFAULT_AUTHOR +
                       '] ').decode(lcl.INPUT_ENC)
    if not app_author:
        app_author = DEFAULT_AUTHOR

    app_email = input(lcl.Q_APP_EMAIL + '[' + DEFAULT_EMAIL +
                      '] ').decode(lcl.INPUT_ENC)
    if not app_email:
        app_email = DEFAULT_EMAIL

    app_url = input(lcl.Q_APP_URL + '[' + DEFAULT_URL +
                    '] ').decode(lcl.INPUT_ENC)
    if not app_url:
        app_url = DEFAULT_URL

    app_keywords = input(lcl.Q_APP_KEYWORDS).decode(lcl.INPUT_ENC)
    if not app_keywords:
        app_keywords = app_name

    data_lst = [app_author, app_email, app_license, app_url, app_version]
    common.save_data(data_lst)

    app_url += app_name

    # backup existing files
    backup = False
    filenames = glob.glob('*')
    filenames += glob.glob('.*')
    if filenames:
        backup = True
        os.mkdir(BAK_DIR)
        for filename in filenames:
            dest = BAK_DIR + '/' + filename.split(os.sep)[-1]
            shu.move(filename, dest)

    filenames = glob.glob(common.DATA_PATH + 'template/*')
    filenames += glob.glob(common.DATA_PATH + 'template/.*')
    # remove doc dir
    filenames = [filename for filename in filenames
                 if 'template' + os.sep + 'doc' not in filename]

    # copy files and dirs
    for filename in filenames:
        if os.path.isfile(filename):
            shu.copyfile(filename, filename.split(os.sep)[-1])
        else:
            shu.copytree(filename, filename.split(os.sep)[-1])

    common.sleep(2)

    os.rename('APPLICATION_NAME', app_name)  # rename application dir

    # collect all filenames, including from 1st level subdirs
    filenames = glob.glob('*')
    filenames = [filename for filename in filenames if BAK_DIR not in filename]
    filenames += glob.glob('.*')
    new_filenames = []
    for filename in filenames:
        if os.path.isdir(filename):
            new_filenames += glob.glob(filename + '/*')
    filenames += new_filenames

    exceptions = ['__init__.py', 'build.cmd', 'requirements.txt',
                  'requirements-dev.txt', 'setup.py', 'setup_py2exe.py',
                  'setup_utils.py']

    # delete .pyc files and update files
    for filename in filenames:
        if os.path.isfile(filename):
            if '.pyc' in filename:
                os.remove(filename)
            else:
                if filename.split(os.sep)[-1] not in exceptions:
                    update_file(filename)

    create_redir2rtd_zip()

    if backup:
        os.remove(app_name + APPLICATION_TEMPLATE_FILE)  # remove app template
        # restore files from backup, but only if they don't already exist
        filenames = glob.glob(BAK_DIR + '/*')
        for filename in filenames:
            dest = app_name + '/' + filename.split(os.sep)[-1]
            if not os.path.isfile(dest):
                shu.copyfile(filename, dest)
    else:
        os.rename(app_name + APPLICATION_TEMPLATE_FILE,
                  app_name + '/' + app_name + '.py')  # rename app template

    print(lcl.REMINDERS)
예제 #48
0
def test_road_quality_module(filename, plot, test_type):
    # initialize module for testing
    train_values    = 1
    train_trees     = 10
    filename_train  = "train_data/road_quality_data.output"
    init_server.init_road_quality_module(filename_train, train_values, train_trees)

    if test_type == "full": # generate new dataset
        print bcolors.HEADER + "initialize dependent modules" + bcolors.ENDC
        init_server.init_speed_module   ("train_data/speed_acc_data.output",   10, 10)
        init_server.init_turns_module   ("train_data/turns_com_data.output",    5, 10)
        init_server.init_defects_module ("train_data/speed_acc_data.output",    5, 20)
        init_server.init_behavior_defects_module("train_data/behavior_defects_data.output", 1, 10) 
        print bcolors.OKGREEN + "Done! " + bcolors.ENDC
 
        # load dependent test data and classify actions
        # structure: time,accx,accy,accz,compass,lat,lon,speed
        test_values = 10
        
        print bcolors.HEADER + "Start getting speed data" + bcolors.ENDC
        test_speed_data = cmn.aver_std_array(cmn.load_data(filename, (2,)), test_values)
        test_speed_data = test_speed_data.reshape(len(test_speed_data)/2, 2)
        predicted_speed = df.predicted(test_speed_data)
        predicted_speed = predicted_speed.reshape(len(predicted_speed), 1)
        print bcolors.OKGREEN + "Done! speed data:\n" + bcolors.ENDC, predicted_speed

        print bcolors.HEADER + "Start getting turns data" + bcolors.ENDC
        test_turns_data = cmn.get_diff_array(cmn.load_data(filename, (4,)))
        test_turns_data = cmn.aver_std_array(test_turns_data, test_values)
        test_turns_data = test_turns_data.reshape(len(test_turns_data)/2, 2)
        predicted_turns = df.predicted(test_turns_data)
        predicted_turns = predicted_turns.reshape(len(predicted_turns), 1)
        print bcolors.OKGREEN + "Done! turns data:\n" + bcolors.ENDC, predicted_turns
   
        print bcolors.HEADER + "Start getting defects data" + bcolors.ENDC
        test_defects_data = cmn.aver_std_array(cmn.load_data(filename, (3,)), test_values)
        test_defects_data = test_defects_data.reshape(len(test_defects_data)/2, 2)
        predicted_defects = df.predicted(test_defects_data)
        predicted_defects = predicted_defects.reshape(len(predicted_defects), 1)
        print bcolors.OKGREEN + "Done! defects data:\n" + bcolors.ENDC, predicted_defects

        print bcolors.HEADER + "Start getting behavior defects data" + bcolors.ENDC
        test_behavior_defects_data = np.hstack((predicted_speed, predicted_turns, predicted_defects))
        predicted_behavior_defects = bd.predicted(test_behavior_defects_data)
        predicted_behavior_defects = predicted_behavior_defects.reshape(len(predicted_behavior_defects), 1)
        print bcolors.OKGREEN + "Done! defects data:\n" + bcolors.ENDC, predicted_defects

        print bcolors.HEADER + "Start generating test data" + bcolors.ENDC
        test_times = cmn.label_array(cmn.load_data(filename, (0,)), test_values)
        test_data  = cmn.sum_array(predicted_behavior_defects, test_values)
        print bcolors.OKGREEN + "Done! test_data:\n" + bcolors.ENDC, test_data
        return

    elif test_type == "express":  
        # use default dataset
        # structure: time,low_defects,high_defects,lat,lon,label
        print bcolors.HEADER + "Start getting test data" + bcolors.ENDC
        test_values = 1
        test_low_defects_data   = cmn.label_array(cmn.load_data(filename, (1,)), values)
        test_low_defects_data   = test_low_defects_data.reshape(len(test_low_defects_data), 1)
        test_high_defects_data  = cmn.label_array(cmn.load_data(filename, (2,)), values)
        test_high_defects_data  = test_high_defects_data.reshape(len(test_high_defects_data), 1)
        test_tent_defects_data  = cmn.label_array(cmn.load_data(filename, (3,)), values)
        test_tent_defects_data  = test_tent_defects_data.reshape(len(test_tent_defects_data), 1)

        test_data = np.hstack((test_low_defects_data, test_high_defects_data, test_tent_defects_data))
        test_times = cmn.label_array(cmn.load_data(filename, (0,)), test_values)
        print bcolors.OKGREEN + "Done! test_data:\n" + bcolors.ENDC, test_data
    else :
        print bcolor.FAIL + "road_quality_module: invalid test type, exit" + bcolors.ENDC
        return

    # choose test data sources by test type
    if test_type == "full":
        # generate new dataset
        init_server.init_speed_module   ("train_data/speed_acc_data.output",   10, 10)
        init_server.init_turns_module   ("train_data/turns_acc_data.output",   10, 10)
        init_server.init_defects_module ("train_data/defects_acc_data.output",  5, 20)
        init_server.init_behavior_defects_module("train_data/behavior_defects_data.output", 10, 10)
        
    elif test_type == "express":
        # use default dataset
        # structure: time,low_defects,high_defects
        test_values = 1
        test_data   = cmn.aver_std_array(cmn.load_data(filename, (2,)), test_values)
        test_times  = cmn.label_array(cmn.load_data(filename, (0,)), test_values)
        test_data = test_data.reshape(len(test_data)/2, 2)
        rq.find_actions(test_data, test_times)

    else :
        print "road_quality_module: invalid test type, exit"
        return

    # analysis of set of defects and determine road quality

    # compare with previous results of road quality analysis

    # update road quality using voting procedure
    return
예제 #49
0
def main(params=None):
    """Reload a previously trained RNN and dump its per-item responses.

    Unlike the training `main` elsewhere in this file, this routine loads a
    saved model (``rnn.load``), classifies all three splits, and then steps
    the network through every train/valid/test item, writing the sigmoid
    response trajectories to CSV plus per-gate .npy files (Python 2 code).

    NOTE(review): the default ``params`` dict below is dead code — it is
    unconditionally overwritten by the JSON read right after it.
    """

    if params is None:
        params = {
            'dataset': 'DRLD',
            'exp_name': 'best_minibatch_mod',
            'test_fold': 0,
            'n_dev_folds': 1,
            'min_doc_thresh': 1,
            'initialize_word_vectors': False,
            'vectors': 'anes_word2vec_300',  # default_word2vec_300, anes_word2vec_300, chars_word2vec_25, eye_1 ...
            'init_scale': 0.2,
            'add_OOV_dim': False,
            'win': 1,                   # size of context window
            'add_DRLD': False,
            'rnn_type': 'LSTM',        # basic, GRU, or LSTM
            'n_hidden': 50,             # size of hidden units
            'pooling_method': 'last',    # max, mean, or attention1/2
            'bidirectional': False,
            'bi_combine': 'concat',        # concat, max, or mean
            'train_embeddings': False,
            'lr': 0.025,                  # learning rate
            'lr_emb_fac': 0.2,            # factor to modify learning rate for embeddings
            'decay_delay': 5,           # number of epochs with no improvement before decreasing learning rate
            'decay_factor': 0.5,        # factor by which to multiply learning rate in case of delay
            'n_epochs': 100,
            'add_OOV_noise': False,
            'OOV_noise_prob': 0.01,
            'minibatch_size': 1,
            'classify_minibatch_size': 1,
            'ensemble': False,
            'save_model': True,
            'seed': 42,
            'verbose': 1,
            'reuse': False,
            'orig_T': 0.04,
            'tau': 0.01,
            'xavier_init': True
        }

    # NOTE(review): overwrites any caller-supplied params unconditionally
    params = fh.read_json('/Users/dcard/Projects/CMU/ARK/guac/experiments/rnn/bayes_opt_rnn_LSTM_reuse_mod_34_rerun/params.txt')
    params['n_hidden'] = int(params['n_hidden'])

    # print the configuration in a stable (sorted) key order
    keys = params.keys()
    keys.sort()
    for key in keys:
        print key, ':', params[key]

    # seed the random number generators
    np.random.seed(params['seed'])
    random.seed(params['seed'])

    # e.g. 'anes_word2vec_300' -> vector_type 'anes', dim 300
    vector_type = params['vectors'].split('_')[0]
    params['word2vec_dim'] = int(params['vectors'].split('_')[-1])


    reuser = None
    if params['reuse']:
        # reusable holdout: masks validation scores to limit adaptive overfitting
        reuser = reusable_holdout.ReuseableHoldout(T=params['orig_T'], tau=params['tau'])

    # map the dataset alias to its list of sub-corpora
    if params['dataset'] == 'DRLD':
        datasets = ['Democrat-Likes', 'Democrat-Dislikes', 'Republican-Likes', 'Republican-Dislikes']
    elif params['dataset'] == 'MIP':
        datasets = ['MIP-Personal-1', 'MIP-Personal-2', 'MIP-Political-1', 'MIP-Political-2']
    elif params['dataset'] == 'MOLD':
        datasets = ['McCain-Likes', 'McCain-Dislikes', 'Obama-Likes', 'Obama-Dislikes']
    elif params['dataset'] == 'Primary':
        datasets = ['Obama-Primary', 'Clinton-Primary']
    elif params['dataset'] == 'General':
        datasets = ['Obama-General', 'McCain-General']
    else:
        datasets = [params['dataset']]

    # (re-seeded; harmless repetition of the lines above)
    np.random.seed(params['seed'])
    random.seed(params['seed'])

    best_valid_f1s = []
    best_true_valid_f1s = []
    best_test_f1s = []
    best_train_f1s = []

    test_prediction_arrays = []

    # persist the effective parameters next to the experiment outputs
    output_dir = fh.makedirs(defines.exp_dir, 'rnn', params['exp_name'])
    output_filename = fh.make_filename(output_dir, 'params', 'txt')
    fh.write_to_json(params, output_filename)

    for dev_fold in range(params['n_dev_folds']):
        print "dev fold =", dev_fold

        output_dir = fh.makedirs(defines.exp_dir, 'rnn', params['exp_name'], 'fold' + str(dev_fold))

        all_data, words2idx, items, all_labels = common.load_data(datasets, params['test_fold'], dev_fold,
                                                                  params['min_doc_thresh'])
        train_xy, valid_xy, test_xy = all_data
        train_lex, train_y = train_xy
        valid_lex, valid_y = valid_xy
        test_lex, test_y = test_xy

        # sort training items by length (used for minibatch ordering)
        train_lengths = [len(x) for x in train_lex]
        length_order = np.argsort(train_lengths)

        #if params['minibatch_size'] > 1 or params['classify_minibatch_size'] > 1:
        print "padding input with zeros"
        #all_data, all_masks = common.prepare_data(train_lex, valid_lex, test_lex, preset_max=100)
        all_data, all_masks = common.prepare_data(train_lex, valid_lex, test_lex)
        train_lex, valid_lex, test_lex = all_data
        train_masks, valid_masks, test_masks = all_masks
        #else:
        #    train_masks = [np.ones(len(x)).astype('int32') for x in train_lex]
        #    valid_masks = [np.ones(len(x)).astype('int32') for x in valid_lex]
        #    test_masks = [np.ones(len(x)).astype('int32') for x in test_lex]

        # pre-expand each sequence with its context windows
        print "expanding x with context win dows"
        # Rejigger to convert x to contex win in advance
        train_x_win = expand_x_with_context_win(train_lex, params['win'])
        valid_x_win = expand_x_with_context_win(valid_lex, params['win'])
        test_x_win = expand_x_with_context_win(test_lex, params['win'])
        order = range(len(train_lex))
        print "done"

        train_items, dev_items, test_items = items
        vocsize = len(words2idx.keys())
        # invert the vocabulary mapping: index -> word
        idx2words = dict((k, v) for v, k in words2idx.iteritems())
        best_test_predictions = None

        n_sentences = len(train_lex)
        print "vocsize = ", vocsize, 'n_train', n_sentences

        codes = all_labels.columns
        n_items, n_codes = all_labels.shape

        # get the words in the sentences for the test and validation sets
        words_valid = [map(lambda x: idx2words[x], w) for w in valid_lex]
        groundtruth_test = test_y[:]
        words_test = [map(lambda x: idx2words[x], w) for w in test_lex]

        #if vector_type == 'eye':
        #    initial_embeddings = np.eye(vocsize)
        #    emb_dim = initial_embeddings.shape[1]
        if params['initialize_word_vectors']:
            initial_embeddings = common.load_embeddings(params, words2idx)
            emb_dim = initial_embeddings.shape[1]
        else:
            # random embeddings of the configured dimensionality
            initial_embeddings = None
            emb_dim = params['word2vec_dim']
        print "embedding dim =", emb_dim

        extra_input_dims = 0
        if params['add_DRLD']:
            #extra_input_dims = 4
            extra_input_dims = 2

        print "Building RNN"
        rnn = RNN(nh=params['n_hidden'],
                  nc=n_codes,
                  ne=vocsize,
                  de=emb_dim,
                  cs=params['win'],
                  extra_input_dims=extra_input_dims,
                  initial_embeddings=initial_embeddings,
                  init_scale=params['init_scale'],
                  rnn_type=params['rnn_type'],
                  train_embeddings=params['train_embeddings'],
                  pooling_method=params['pooling_method'],
                  bidirectional=params['bidirectional'],
                  bi_combine=params['bi_combine'],
                  xavier_init=params['xavier_init']
                  )

        # add extra dimensions to differentiate between paired datasets
        train_likes = [1 if re.search('Likes', i) else 0 for i in train_items]
        dev_likes = [1 if re.search('Likes', i) else 0 for i in dev_items]
        test_likes = [1 if re.search('Likes', i) else 0 for i in test_items]

        train_dem = [1 if re.search('Democrat', i) else 0 for i in train_items]
        dev_dem = [1 if re.search('Democrat', i) else 0 for i in dev_items]
        test_dem = [1 if re.search('Democrat', i) else 0 for i in test_items]

        """
        train_obama = [1 if re.search('Obama', i) else 0 for i in train_items]
        dev_obama = [1 if re.search('Obama', i) else 0 for i in dev_items]
        test_obama = [1 if re.search('Obama', i) else 0 for i in test_items]

        train_personal = [1 if re.search('Personal', i) else 0 for i in train_items]
        dev_personal = [1 if re.search('Personal', i) else 0 for i in dev_items]
        test_personal = [1 if re.search('Personal', i) else 0 for i in test_items]

        train_extra = [[train_likes[i], train_dem[i], train_obama[i], train_personal[i]] for i, t in enumerate(train_items)]
        dev_extra = [[dev_likes[i], dev_dem[i], dev_obama[i], dev_personal[i]] for i, t in enumerate(dev_items)]
        test_extra = [[test_likes[i], test_dem[i], test_obama[i], test_personal[i]] for i, t in enumerate(test_items)]
        """

        train_extra = [[train_likes[i], train_dem[i]] for i, t in enumerate(train_items)]
        dev_extra = [[dev_likes[i], dev_dem[i]] for i, t in enumerate(dev_items)]
        test_extra = [[test_likes[i], test_dem[i]] for i, t in enumerate(test_items)]


        ### LOAD
        # restore the trained weights saved by the training run
        rnn.load(output_dir)

        # train with early stopping on validation set
        best_f1 = -np.inf
        params['clr'] = params['lr']
        n_train = len(order)



        # classify all three splits with the loaded model
        predictions_train = predict(n_train, params['classify_minibatch_size'], train_x_win, train_masks,
                                     train_y, params['win'], extra_input_dims, train_extra, rnn, order)
        n_valid = len(valid_lex)
        n_test = len(test_lex)
        predictions_valid = predict(n_valid, params['classify_minibatch_size'], valid_x_win, valid_masks,
                                    valid_y, params['win'], extra_input_dims, dev_extra, rnn)
        predictions_test = predict(n_test, params['classify_minibatch_size'], test_x_win, test_masks,
                                    test_y, params['win'], extra_input_dims, test_extra, rnn)

        """
        predictions_train = [rnn.classify(x, train_masks[i], params['win'],
                                          extra_input_dims, train_extra[i])[0] for i, x in enumerate(train_lex)]
        predictions_valid = [rnn.classify(x, valid_masks[i], params['win'],
                                          extra_input_dims, dev_extra[i])[0] for i, x in enumerate(valid_lex)]
        predictions_test = [rnn.classify(x, test_masks[i], params['win'],
                                         extra_input_dims, test_extra[i])[0] for i, x in enumerate(test_lex)]
        """

        train_f1 = common.calc_mean_f1(predictions_train, train_y)
        test_f1 = common.calc_mean_f1(predictions_test, test_y)
        valid_f1 = common.calc_mean_f1(predictions_valid, valid_y)

        output_dir = fh.makedirs(output_dir, 'responses')

        ms = 1  # minibatch size of 1: step through one item at a time

        # for every item, record the per-timestep sigmoid responses (CSV)
        # and the raw gate/state trajectories (.npy) of the network
        for i in range(n_train):
            mb_x, mb_masks, mb_extra, mb_y = select_minibatch(train_x_win, train_masks, train_extra, train_y,
                                                              params['win'], i, ms, order=range(len(train_y)))

            h, W, b, p_y, s, i_f, i_r, \
                f_f, f_r, o_f, o_r, c = rnn.step_through(mb_x, mb_masks, params['win'], extra_input_dims, mb_extra)

            # recompute sigmoid(h.W + b) from the raw hidden states
            temp = np.dot(h, W) + b
            s = 1.0/(1.0 + np.exp(-temp))
            output_filename = fh.make_filename(output_dir, train_items[i], 'csv')
            np.savetxt(output_filename, s[:, 0, :], delimiter=',')
            output_npy_files(output_dir, train_items[i], i_f, i_r, f_f, f_r, o_f, o_r, h, c)

        for i in range(n_valid):
            mb_x, mb_masks, mb_extra, mb_y = select_minibatch(valid_x_win, valid_masks, dev_extra, valid_y,
                                                              params['win'], i, ms, order=range(len(valid_y)))

            h, W, b, p_y, s, i_f, i_r, \
                f_f, f_r, o_f, o_r, c = rnn.step_through(mb_x, mb_masks, params['win'], extra_input_dims, mb_extra)

            temp = np.dot(h, W) + b
            s = 1.0/(1.0 + np.exp(-temp))
            output_filename = fh.make_filename(output_dir, dev_items[i], 'csv')
            np.savetxt(output_filename, s[:, 0, :], delimiter=',')
            output_npy_files(output_dir, dev_items[i], i_f, i_r, f_f, f_r, o_f, o_r, h, c)

        for i in range(n_test):
            mb_x, mb_masks, mb_extra, mb_y = select_minibatch(test_x_win, test_masks, test_extra, test_y,
                                                              params['win'], i, ms, order=range(len(test_y)))

            h, W, b, p_y, s, i_f, i_r,\
                f_f, f_r, o_f, o_r, c = rnn.step_through(mb_x, mb_masks, params['win'], extra_input_dims, mb_extra)

            temp = np.dot(h, W) + b
            s = 1.0/(1.0 + np.exp(-temp))
            output_filename = fh.make_filename(output_dir, test_items[i], 'csv')
            np.savetxt(output_filename, s[:, 0, :], delimiter=',')
            output_npy_files(output_dir, test_items[i], i_f, i_r, f_f, f_r, o_f, o_r, h, c)

        print "train_f1 =", train_f1, "valid_f1 =", valid_f1, "test_f1 =", test_f1
예제 #50
0
 def setUp(self):
     """Load the 'asg' section of report.json into the test fixtures."""
     asg = load_data('report.json')['asg']
     self.records = asg['records']
     self.headers = asg['headers']
     self.rows = asg['rows']
예제 #51
0
def preprocess(record, stopword=False, filtered_post_tag=False, basic_word = False, lemmatize=True):
  """Clean a review record in place: decode, tokenize, preprocess each word, rejoin.

  Returns the same record with its 'review' field replaced by the
  preprocessed, UTF-8 encoded text.
  """
  text = record['review'].decode('UTF-8')
  # Ensure a space follows every period so the tokenizer splits sentences.
  text = text.replace('.', '. ')

  tagged = nltk.tag._pos_tag(word_tokenize(text), None, tagger)

  cleaned = []
  for word, tag in tagged:
    token = preprocess_word(word, tag, stopword, filtered_post_tag, lemmatize, basic_word)
    if token != "":
      cleaned.append(token)

  record['review'] = u' '.join(cleaned).encode('utf-8').strip()
  return record

if __name__ == '__main__':
  # Preprocess the training reviews and dump them as a TSV file.
  records = load_data('data/reviews.tsv')

  preprocess_records = [preprocess(record, stopword=False, basic_word = True, lemmatize=True)
                        for record in records]

  with open('data/preprocessed_reviews.tsv', 'w') as preprocess_file:
    preprocess_file.write('id\treview\tsentiment\n')

    for record in records:
      # The write stays inside the try: in Python 2 both the %-formatting
      # and the file write can raise UnicodeEncodeError.
      try:
        fields = (record['id'].decode('UTF-8'),
                  record['review'].decode('UTF-8'),
                  record['sentiment'])
        preprocess_file.write('%s\t%s\t%i\n' % fields)
      except UnicodeEncodeError:
        print("unicode encode error")
        continue
예제 #52
0
import common

'''the code goes here'''

def calc_price(word):
	"""Return a dict mapping each character of *word* to its occurrence count.

	Equivalent to dict(collections.Counter(word)), but kept as a plain
	hand-rolled dict to match the rest of this script.
	"""
	price = {}
	for ch in word:
		# dict.get replaces dict.has_key(), which was removed in Python 3
		# and needed a second lookup to read the value anyway.
		price[ch] = price.get(ch, 0) + 1
	return price


# NOTE(review): set index is hard-coded; presumably files inter00..inter0N exist — confirm.
i = 2;
data = common.load_data('../dane/sets/inter0' + str(i) + '.in')
alph = 'abcdefghijklmnopqrstuvwxyz'
# al maps each lowercase letter to its integer price, read from the first input line.
al = {}

left_data = data[0]
left_data = left_data.split(' ')

# NOTE(review): this loop rebinds the module-level `i` above (it is 25 afterwards);
# any later code reading `i` sees the loop's final value, not 2.
for i in range(0,26):
	al[alph[i]] = int(left_data[i])

# Second input line gives the number of data lines that follow.
lines = int(data[1])

words = []
sents = []

#read lines
예제 #53
0
        # NOTE(review): fragment of calc_freqs — its `def` and the row loop header
        # sit above this view.
        # Nearest-station lookup: kd-tree query returns (distance, index);
        # only the station index is used as a label.
        pickup = np.array([row["pickup_longitude"],
                           row["pickup_latitude"]])
        dropoff = np.array([row["dropoff_longitude"],
                            row["dropoff_latitude"]])
        _, p_label = stations_kd.query(pickup)
        _, d_label = stations_kd.query(dropoff)
        # Count one trip for this (time interval, weekday, pickup, dropoff) cell.
        freqs[interval][wday][p_label][d_label] += 1
    del df
    return freqs


if __name__ == "__main__":
    n_workers = 8
    chunksize = 100000
    pool = Pool(n_workers, maxtasksperchild=1)
    dfs = common.load_data(chunksize)
    freqs = np.zeros((intervals_per_day, 7, n_stations, n_stations),
                     dtype=np.int)
    print "Computing probabilities..."

    stuff_to_do = True
    pbar = tqdm.tqdm(total=n_lines)
    while stuff_to_do:
        sub_dfs = list()
        for i in xrange(n_workers):
            try:
                sub_dfs.append(next(dfs))
            except StopIteration:
                stuff_to_do = False
        if len(sub_dfs) > 0:
            freqs += sum(pool.map(calc_freqs, sub_dfs))
예제 #54
0
 def on_timer_task(self, event):
     """Periodic timer callback: refresh the user table, then re-arm the timer."""
     refreshed = common.load_data("users.pickle")
     self.users = refreshed
     event.reactor.schedule(self.user_reread_period, self)